Commit 741a6a69 authored by lintangsutawika
Browse files

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into mela

parents 494a4515 b536f067
...@@ -11,19 +11,25 @@ import torch ...@@ -11,19 +11,25 @@ import torch
import lm_eval.api.metrics import lm_eval.api.metrics
import lm_eval.api.registry import lm_eval.api.registry
import lm_eval.api.task
import lm_eval.models import lm_eval.models
from lm_eval.caching.cache import delete_cache from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import ( from lm_eval.evaluator_utils import (
consolidate_group_results,
consolidate_results, consolidate_results,
get_sample_size, get_sample_size,
get_subtask_list,
get_task_list, get_task_list,
prepare_print_tasks, prepare_print_tasks,
print_writeout, print_writeout,
run_task_tests, run_task_tests,
) )
from lm_eval.loggers import EvaluationTracker from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, get_git_commit_hash from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger, eval_logger,
handle_non_serializable, handle_non_serializable,
...@@ -35,7 +41,7 @@ from lm_eval.utils import ( ...@@ -35,7 +41,7 @@ from lm_eval.utils import (
if TYPE_CHECKING: if TYPE_CHECKING:
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.tasks import Task from lm_eval.api.task import Task
@positional_deprecated @positional_deprecated
...@@ -44,7 +50,7 @@ def simple_evaluate( ...@@ -44,7 +50,7 @@ def simple_evaluate(
model_args: Optional[Union[str, dict]] = None, model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None, tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None, num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None, batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None,
device: Optional[str] = None, device: Optional[str] = None,
use_cache: Optional[str] = None, use_cache: Optional[str] = None,
...@@ -58,7 +64,7 @@ def simple_evaluate( ...@@ -58,7 +64,7 @@ def simple_evaluate(
log_samples: bool = True, log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None, evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None, gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
...@@ -106,8 +112,11 @@ def simple_evaluate( ...@@ -106,8 +112,11 @@ def simple_evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str :param system_instruction: str
System instruction to be applied to the prompt System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: Union[bool, str]
If True, apply chat template to the prompt Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param gen_kwargs: str :param gen_kwargs: str
...@@ -219,48 +228,61 @@ def simple_evaluate( ...@@ -219,48 +228,61 @@ def simple_evaluate(
task_manager = TaskManager(verbosity) task_manager = TaskManager(verbosity)
task_dict = get_task_dict(tasks, task_manager) task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if isinstance(task_obj, tuple):
_, task_obj = task_obj
if task_obj is None:
continue
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only: # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
log_samples = True # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
eval_logger.info( def _adjust_config(task_dict):
f"Processing {task_name} in output-only mode. Metrics will not be calculated!" adjusted_task_dict = {}
) for task_name, task_obj in task_dict.items():
# we have to change the class properties post-hoc. This is pretty hacky. if isinstance(task_obj, dict):
task_obj.override_metric(metric_name="bypass") adjusted_task_dict = {
**adjusted_task_dict,
**{task_name: _adjust_config(task_obj)},
}
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else: else:
eval_logger.warning( if task_obj.get_config("output_type") == "generate_until":
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
default_num_fewshot := task_obj.get_config("num_fewshot")
) is None:
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
) )
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else: adjusted_task_dict[task_name] = task_obj
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: return adjusted_task_dict
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file) task_dict = _adjust_config(task_dict)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
if check_integrity: if check_integrity:
run_task_tests(task_list=tasks) run_task_tests(task_list=tasks)
...@@ -270,7 +292,8 @@ def simple_evaluate( ...@@ -270,7 +292,8 @@ def simple_evaluate(
model_source=model, model_source=model,
model_args=model_args, model_args=model_args,
system_instruction=system_instruction, system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None, chat_template=lm.chat_template(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
) )
results = evaluate( results = evaluate(
...@@ -281,7 +304,7 @@ def simple_evaluate( ...@@ -281,7 +304,7 @@ def simple_evaluate(
rewrite_requests_cache=rewrite_requests_cache, rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters, bootstrap_iters=bootstrap_iters,
write_out=write_out, write_out=write_out,
log_samples=log_samples, log_samples=True if predict_only else log_samples,
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
...@@ -325,6 +348,7 @@ def simple_evaluate( ...@@ -325,6 +348,7 @@ def simple_evaluate(
results["git_hash"] = get_git_commit_hash() results["git_hash"] = get_git_commit_hash()
results["date"] = start_date results["date"] = start_date
add_env_info(results) # additional environment info to results add_env_info(results) # additional environment info to results
add_tokenizer_info(results, lm) # additional info about tokenizer
return results return results
else: else:
return None return None
...@@ -341,7 +365,7 @@ def evaluate( ...@@ -341,7 +365,7 @@ def evaluate(
write_out: bool = False, write_out: bool = False,
log_samples: bool = True, log_samples: bool = True,
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
verbosity: str = "INFO", verbosity: str = "INFO",
): ):
...@@ -361,8 +385,11 @@ def evaluate( ...@@ -361,8 +385,11 @@ def evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str :param system_instruction: str
System instruction to be applied to the prompt System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: Union[bool, str]
If True, apply chat template to the prompt Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:return :return
...@@ -378,7 +405,7 @@ def evaluate( ...@@ -378,7 +405,7 @@ def evaluate(
padding_requests = defaultdict(int) padding_requests = defaultdict(int)
# get lists of group hierarchy and each type of request # get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict) eval_tasks = get_task_list(task_dict)
if not log_samples: if not log_samples:
if not all( if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
...@@ -395,9 +422,14 @@ def evaluate( ...@@ -395,9 +422,14 @@ def evaluate(
cache_requests=cache_requests, cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache, rewrite_requests_cache=rewrite_requests_cache,
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=bool(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
lm=lm, chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
) )
eval_logger.debug( eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
...@@ -550,106 +582,45 @@ def evaluate( ...@@ -550,106 +582,45 @@ def evaluate(
### Calculate group metrics ### ### Calculate group metrics ###
if bool(results): if bool(results):
for group, task_list in reversed(task_hierarchy.items()): results, versions, show_group_table, *_ = consolidate_group_results(
if len(task_list) == 0: results, versions, task_dict
# task_hierarchy entries are either )
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`. results_agg, group_agg = prepare_print_tasks(task_dict, results)
# we only want to operate on groups here. subtask_list = get_subtask_list(task_dict)
continue
# collect all higher_is_better values for metrics
# collect all higher_is_better values for metrics # in the group's subtasks.
# in the group's subtasks. # TODO: clean this up ; unify with the below metric_list loop?
# TODO: clean this up ; unify with the below metric_list loop? _higher_is_better = {}
_higher_is_better = {} for group, task_list in subtask_list.items():
if (
len(task_list) != 0
): # subtask list will list "task_name": [] for solo tasks
for task in task_list: for task in task_list:
for m, h in higher_is_better[task].items(): for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys(): if m not in _higher_is_better.keys():
_higher_is_better[m] = h _higher_is_better[m] = h
if (
m in _higher_is_better
and _higher_is_better[m] is not None
and _higher_is_better[m] != h
):
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
# collect all metric keys used by a subtask in the group. if (
metric_list = list( m in _higher_is_better
{ and _higher_is_better[m] is not None
key and _higher_is_better[m] != h
for task in task_list ):
for key in results[task].keys() eval_logger.warning(
if "_stderr" not in key and key not in ["alias", "samples"] f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
} )
) _higher_is_better[m] = None
for metric in metric_list: higher_is_better[group] = _higher_is_better
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
results_agg = defaultdict(dict)
groups_agg = defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
while True:
add_tasks_list = list(k for k in results_agg.keys())
left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
if len(left_tasks_list) == 0:
break
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
results_dict = { results_dict = {
"results": dict(results_agg.items()), "results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), **(
"group_subtasks": dict(reversed(task_hierarchy.items())), {"groups": dict(group_agg.items())}
if (bool(group_agg) & show_group_table)
else {}
),
"group_subtasks": dict(reversed(subtask_list.items())),
"configs": dict(sorted(configs.items())), "configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())), "versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())), "n-shot": dict(sorted(num_fewshot.items())),
......
...@@ -2,9 +2,15 @@ import collections ...@@ -2,9 +2,15 @@ import collections
import math import math
import pathlib import pathlib
import sys import sys
from typing import Dict, List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
from lm_eval.api import metrics from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated from lm_eval.utils import eval_logger, positional_deprecated
...@@ -98,7 +104,7 @@ class TaskOutput: ...@@ -98,7 +104,7 @@ class TaskOutput:
self.agg_metrics[metric_key] = agg_fn(items) self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric? self.sample_len = len(items) # TODO: same sample size for each metric?
if isinstance(bootstrap_iters, int): if isinstance(bootstrap_iters, int):
stderr_fn = metrics.stderr_for_metric( stderr_fn = stderr_for_metric(
metric=agg_fn, metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100) bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"] if metric in ["bleu", "chrf", "ter"]
...@@ -116,23 +122,71 @@ class TaskOutput: ...@@ -116,23 +122,71 @@ class TaskOutput:
return ( return (
f"TaskOutput(task_name={self.task_name}, " f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, " f"group_name={self.group_name}, "
f"version={self.version}," f"version={self.version}, "
f"n_shot={self.n_shot}" f"n_shot={self.n_shot}, "
f"task_alias={self.task_alias}, group_alias={self.group_alias})" f"task_alias={self.task_alias}, "
f"group_alias={self.group_alias})"
) )
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]: def get_task_list(task_dict: dict) -> List[TaskOutput]:
task_hierarchy = collections.defaultdict(list) outputs = []
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items()) for task_name, task_obj in task_dict.items():
for task_output in outputs: if isinstance(task_obj, dict):
if group_name := task_output.group_name: _outputs = get_task_list(task_obj)
task_hierarchy[group_name].append(task_output.task_name) outputs.extend(_outputs)
else: else:
task_hierarchy[task_output.task_name] = [] task_output = TaskOutput.from_taskdict(task_name, task_obj)
# returns task_hierarchy tracking which groups contain which subtasks, outputs.append(task_output)
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task] return outputs
def get_subtask_list(task_dict, task_root=None, depth=0):
    """
    Flattens a (possibly nested) task dict into a mapping from each group or
    solo task name to the names of its direct children.

    @param task_dict: Dictionary whose keys are ConfigurableGroup objects or
        plain names, and whose values are either nested dictionaries (the
        members of a group) or leaf ConfigurableGroup / Task objects.
    @param task_root: Name of the group that owns `task_dict`, or None for the
        top-level call. It is filled in by the recursion.
    @param depth: Nesting level of `task_dict`. 0 for the top-level call.
    @return: For depth 0, a dict `{name: [child_name, ...]}` in which top-level
        leaves map to an empty list. For deeper levels, the same mapping keyed
        by `(name, depth)` tuples, which the enclosing call merges into its own.
    """
    subtask_list = {}
    for group_obj, task_obj in task_dict.items():
        if isinstance(group_obj, ConfigurableGroup):
            group_name = group_obj.group_name
        else:
            group_name = group_obj

        if isinstance(task_obj, dict):
            _subtask_list = get_subtask_list(
                task_obj, task_root=group_name, depth=depth + 1
            )
            if task_root:
                # Only entries recorded exactly one level below belong to the
                # current root; deeper ones are owned by their own group.
                subtask_list.setdefault((task_root, depth), []).extend(
                    _task
                    for (_task, _depth) in _subtask_list
                    if (_depth - 1) == depth
                )
            subtask_list = {**subtask_list, **_subtask_list}
            continue

        if isinstance(task_obj, ConfigurableGroup):
            group_or_task_name = task_obj.group_name
        elif isinstance(task_obj, Task):
            group_or_task_name = task_obj.task_name
        else:
            # Leaves of any other type fall back to the name taken from their
            # dict key, instead of raising UnboundLocalError or silently
            # reusing the name left over from a previous iteration.
            group_or_task_name = group_name

        if task_root is None:
            subtask_list.setdefault((group_or_task_name, depth), [])
        else:
            subtask_list.setdefault((task_root, depth), []).append(
                group_or_task_name
            )

    if depth == 0:
        # Drop the depth component of the keys for the caller-facing result.
        subtask_list = {
            name: children for (name, _), children in subtask_list.items()
        }

    return subtask_list
def print_writeout(task) -> None: def print_writeout(task) -> None:
...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: ...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
def prepare_print_tasks( def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0 task_dict: dict,
results: dict,
task_depth=0,
group_depth=0,
) -> Tuple[dict, dict]: ) -> Tuple[dict, dict]:
""" """
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names. value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a @param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results. group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task @param task_depth: The indentation level for printing the task
hierarchy. Default is 0.
@param group_depth: The indentation level for printing the group
hierarchy. Default is 0. hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group. aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing. Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
""" """
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]: def _sort_task_dict(task_dict):
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"] """
else: Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
results_agg[group_name]["alias"] = tab_string + group_name Required so that we end up sorting within each sub-header correctly.
"""
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy() return dict(
# groups_agg[group_name]["tab"] = tab sorted(
if "samples" in groups_agg[group_name]: task_dict.items(),
groups_agg[group_name].pop("samples") key=lambda item: item[0].group_name
if isinstance(item[0], ConfigurableGroup)
if "alias" in groups_agg[group_name]: else item[0],
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
) )
else: )
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list: task_agg = collections.defaultdict(dict)
if task_name in task_hierarchy: group_agg = collections.defaultdict(dict)
_task_hierarchy = { task_dict = _sort_task_dict(task_dict)
**{task_name: task_hierarchy[task_name]}, for task_or_group_name, task_or_group_obj in task_dict.items():
**task_hierarchy, tab_string = " " * task_depth + "- " if task_depth > 0 else ""
} if isinstance(task_or_group_name, ConfigurableGroup):
# string_name = task_or_group_name.group_name
name = task_or_group_name.group_name
from_configurable_group = True
task_or_group_obj = _sort_task_dict(task_or_group_obj)
elif isinstance(task_or_group_name, str):
name = task_or_group_name
if isinstance(task_or_group_obj, Task):
# string_name = task_or_group_obj.task_name
name = task_or_group_obj.task_name
from_configurable_group = False
task_agg[name] = results[name].copy()
if from_configurable_group:
if task_or_group_name.group_alias is not None:
alias = task_or_group_name.group_alias
else: else:
_task_hierarchy = { alias = task_or_group_name.group
**{task_name: []}, else:
**task_hierarchy, if "alias" in task_agg[name]:
} alias = task_agg[name]["alias"]
else:
_results_agg, _groups_agg = prepare_print_tasks( alias = name
_task_hierarchy, results, tab + 1
task_agg[name]["alias"] = tab_string + alias
if "samples" in task_agg[name]:
task_agg[name].pop("samples")
if from_configurable_group and (" " not in results[name]):
group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
group_agg[name] = results[name].copy()
group_agg[name]["alias"] = group_tab_string + alias
if "samples" in group_agg[name]:
group_agg[name].pop("samples")
if isinstance(task_or_group_obj, dict):
task_depth += 1
group_depth += 1
_task_agg, _group_agg = prepare_print_tasks(
task_or_group_obj, results, task_depth, group_depth
) )
results_agg = {**results_agg, **_results_agg} task_agg = {
groups_agg = {**groups_agg, **_groups_agg} **task_agg,
**_task_agg,
return results_agg, groups_agg }
group_agg = {**group_agg, **_group_agg}
task_depth -= 1
group_depth -= 1
return task_agg, group_agg
def consolidate_results( def consolidate_results(
...@@ -261,6 +340,8 @@ def consolidate_results( ...@@ -261,6 +340,8 @@ def consolidate_results(
for task_output in eval_tasks: for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config): if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"] results[task_output.task_name]["alias"] = task_config["task_alias"]
else:
results[task_output.task_name]["alias"] = task_output.task_name
if group_alias := task_output.group_alias: if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name): if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias results[group_name]["alias"] = group_alias
...@@ -275,12 +356,153 @@ def consolidate_results( ...@@ -275,12 +356,153 @@ def consolidate_results(
metric_key metric_key
] ]
results[task_output.task_name]["samples"] = task_output.sample_len results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][ results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
f"{metric}_stderr,{filter_key}" task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] )
return results, samples, configs, versions, num_fewshot, higher_is_better return results, samples, configs, versions, num_fewshot, higher_is_better
def consolidate_group_results(
results,
versions,
task_dict,
task_root=None,
show_group_table=False,
task_aggregation_list=None,
) -> Tuple[dict, dict, bool, dict]:
"""
(Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
@param results: Dictionary of per-task results, keyed by task name. Entries for groups are written into it by this function.
@param versions: Dictionary of versions, keyed by task name. Entries for groups are written into it by this function.
@param task_dict: Dictionary describing the task hierarchy. Keys are ConfigurableGroup objects or names; values are
Task objects or nested dictionaries of the same form.
@param task_root: Name of the group that `task_dict` belongs to. Left as None by the top-level caller.
@param show_group_table: Running flag carried through the recursion. Left as False by the top-level caller.
@param task_aggregation_list: Running mapping carried through the recursion. Left as None by the top-level caller.
@return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
- results: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
- versions: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
- show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
- task_aggregation_list: a dict listing the subtasks to average over to produce a given group's end metric.
The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
In the top-level invocation of this function, task_aggregation_list is ignored.
"""
# An empty (falsy) root marks the top-level call: the `if task_root:` guards
# below then skip registering subtasks under a parent group.
if task_root is None:
task_root = {}
if task_aggregation_list is None:
task_aggregation_list = {}
for group_or_task, group_or_task_info in task_dict.items():
# Convert to string
if isinstance(group_or_task, ConfigurableGroup):
group_config = group_or_task.config
group_or_task = group_or_task.group_name
else:
group_config = None
if isinstance(group_or_task_info, Task):
# Leaf task: record it under the enclosing group, if there is one.
if task_root:
task_aggregation_list.setdefault(task_root, []).append(
group_or_task_info.task_name
)
else:
# Nested entry: recurse first so that the subtasks of this group are known.
(
results,
versions,
show_group_table,
_task_aggregation_list,
) = consolidate_group_results(
results,
versions,
group_or_task_info,
group_or_task,
show_group_table,
task_aggregation_list,
)
# Propagate this group's subtasks up to the enclosing group as well.
if task_root:
task_aggregation_list.setdefault(task_root, []).extend(
task_aggregation_list.get(group_or_task, [])
)
# Groups without an aggregation config only get a " " placeholder entry
# (prepare_print_tasks checks for this key when building the group table).
if (group_config is None) or (
group_config["aggregate_metric_list"] is None
):
results[group_or_task][" "] = " "
continue
# NOTE(review): the check above already indexes group_config["aggregate_metric_list"],
# so this membership test looks redundant -- confirm against the group config type.
if "aggregate_metric_list" in group_config:
agg_metric_list = group_config["aggregate_metric_list"]
show_group_table = show_group_table | bool(
group_config["aggregate_metric_list"]
)
task_list = _task_aggregation_list[group_or_task]
# collect all metric keys used by a subtask in the group (stderr and bookkeeping keys excluded).
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["task", "alias", "samples"]
}
)
for metric in metric_list:
# "metric,filter" -> "metric_stderr,filter"
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
for metric_config in agg_metric_list:
for filter_name in metric_config["filter_list"]:
# only aggregate metric/filter pairs that the group config asks for
if metric != ",".join([metric_config["metric"], filter_name]):
continue
# compute group's pooled metric and stderr
if metric_config["aggregation"] == "mean":
aggregate_fn = aggregate_subtask_metrics
elif callable(metric_config["aggregation"]):
aggregate_fn = metric_config["aggregation"]
else:
raise ValueError(
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
)
# the aggregation callable receives the subtask metrics, their sample counts, and the weight_by_size setting
results[group_or_task][metric] = aggregate_fn(
metrics,
sizes,
metric_config["weight_by_size"],
)
# TODO: calculate groups' metrics using arbitrary agg fns
if "N/A" in stderrs:
results[group_or_task][stderr] = "N/A"
else:
# NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
results[group_or_task][stderr] = pooled_sample_stderr(
stderrs, sizes
)
results[group_or_task]["samples"] = sum(sizes)
# a group's version, when given, comes from the "metadata" entry of its config
group_metadata = group_config.get("metadata", None)
if group_metadata is not None:
versions[group_or_task] = group_metadata.get("version", None)
return results, versions, show_group_table, task_aggregation_list
@positional_deprecated @positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path: def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
""" """
......
...@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter): ...@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
def filter_set(inst): def filter_set(inst):
filtered_resp = [] filtered_resp = []
for resp in inst: for resp in inst:
if resp.startswith(" "): resp = resp.lstrip()
resp = resp[1:]
filtered_resp.append(resp) filtered_resp.append(resp)
return filtered_resp return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps] filtered_resps = [filter_set(resp) for resp in resps]
......
import json import json
import os
import re import re
import time import time
from collections import defaultdict from collections import defaultdict
...@@ -14,6 +15,7 @@ from huggingface_hub import ( ...@@ -14,6 +15,7 @@ from huggingface_hub import (
HfApi, HfApi,
hf_hub_url, hf_hub_url,
) )
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger, eval_logger,
...@@ -48,6 +50,7 @@ class GeneralConfigTracker: ...@@ -48,6 +50,7 @@ class GeneralConfigTracker:
model_name_sanitized: str = None model_name_sanitized: str = None
system_instruction: str = None system_instruction: str = None
system_instruction_sha: str = None system_instruction_sha: str = None
fewshot_as_multiturn: bool = None
chat_template: str = None chat_template: str = None
chat_template_sha: str = None chat_template_sha: str = None
start_time: float = None start_time: float = None
...@@ -80,6 +83,7 @@ class GeneralConfigTracker: ...@@ -80,6 +83,7 @@ class GeneralConfigTracker:
model_args: str, model_args: str,
system_instruction: str, system_instruction: str,
chat_template: str, chat_template: str,
fewshot_as_multiturn: bool,
) -> None: ) -> None:
"""Logs model parameters and job ID.""" """Logs model parameters and job ID."""
self.model_source = model_source self.model_source = model_source
...@@ -91,6 +95,7 @@ class GeneralConfigTracker: ...@@ -91,6 +95,7 @@ class GeneralConfigTracker:
) )
self.chat_template = chat_template self.chat_template = chat_template
self.chat_template_sha = hash_string(chat_template) if chat_template else None self.chat_template_sha = hash_string(chat_template) if chat_template else None
self.fewshot_as_multiturn = fewshot_as_multiturn
def log_end_time(self) -> None: def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time.""" """Logs the end time of the evaluation and calculates the total evaluation time."""
...@@ -109,12 +114,15 @@ class EvaluationTracker: ...@@ -109,12 +114,15 @@ class EvaluationTracker:
output_path: str = None, output_path: str = None,
hub_results_org: str = "", hub_results_org: str = "",
hub_repo_name: str = "", hub_repo_name: str = "",
details_repo_name: str = "",
results_repo_name: str = "",
push_results_to_hub: bool = False, push_results_to_hub: bool = False,
push_samples_to_hub: bool = False, push_samples_to_hub: bool = False,
public_repo: bool = False, public_repo: bool = False,
token: str = "", token: str = "",
leaderboard_url: str = "", leaderboard_url: str = "",
point_of_contact: str = "", point_of_contact: str = "",
gated: bool = False,
) -> None: ) -> None:
""" """
Creates all the necessary loggers for evaluation tracking. Creates all the necessary loggers for evaluation tracking.
...@@ -123,12 +131,15 @@ class EvaluationTracker: ...@@ -123,12 +131,15 @@ class EvaluationTracker:
output_path (str): Path to save the results. If not provided, the results won't be saved. output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token. hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`. hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
details_repo_name (str): The name of the Hugging Face repository to push the details (per-sample results) to. If not provided, the details will be pushed to `lm-eval-results`.
results_repo_name (str): The name of the Hugging Face repository to push the aggregated results to. If not provided, the results will be pushed to the same repository as the details (`details_repo_name`).
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub. push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub. push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository. public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`. token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card. leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card. point_of_contact (str): Contact information on the Hugging Face hub dataset card.
gated (bool): Whether to gate the repository.
""" """
self.general_config_tracker = GeneralConfigTracker() self.general_config_tracker = GeneralConfigTracker()
...@@ -139,6 +150,7 @@ class EvaluationTracker: ...@@ -139,6 +150,7 @@ class EvaluationTracker:
self.leaderboard_url = leaderboard_url self.leaderboard_url = leaderboard_url
self.point_of_contact = point_of_contact self.point_of_contact = point_of_contact
self.api = HfApi(token=token) if token else None self.api = HfApi(token=token) if token else None
self.gated_repo = gated
if not self.api and (push_results_to_hub or push_samples_to_hub): if not self.api and (push_results_to_hub or push_samples_to_hub):
raise ValueError( raise ValueError(
...@@ -156,9 +168,24 @@ class EvaluationTracker: ...@@ -156,9 +168,24 @@ class EvaluationTracker:
f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'." f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
) )
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results" if hub_repo_name == "":
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}" details_repo_name = (
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private" details_repo_name if details_repo_name != "" else "lm-eval-results"
)
results_repo_name = (
results_repo_name if results_repo_name != "" else details_repo_name
)
else:
details_repo_name = hub_repo_name
results_repo_name = hub_repo_name
eval_logger.warning(
"hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
)
self.details_repo = f"{hub_results_org}/{details_repo_name}"
self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
self.results_repo = f"{hub_results_org}/{results_repo_name}"
self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
def save_results_aggregated( def save_results_aggregated(
self, self,
...@@ -208,9 +235,9 @@ class EvaluationTracker: ...@@ -208,9 +235,9 @@ class EvaluationTracker:
if self.api and self.push_results_to_hub: if self.api and self.push_results_to_hub:
repo_id = ( repo_id = (
self.hub_results_repo self.results_repo
if self.public_repo if self.public_repo
else self.hub_results_repo_private else self.results_repo_private
) )
self.api.create_repo( self.api.create_repo(
repo_id=repo_id, repo_id=repo_id,
...@@ -218,10 +245,15 @@ class EvaluationTracker: ...@@ -218,10 +245,15 @@ class EvaluationTracker:
private=not self.public_repo, private=not self.public_repo,
exist_ok=True, exist_ok=True,
) )
self.api.upload_folder( self.api.upload_file(
repo_id=repo_id, repo_id=repo_id,
folder_path=str(path), path_or_fileobj=str(
path_in_repo=self.general_config_tracker.model_name_sanitized, path.joinpath(f"results_{self.date_id}.json")
),
path_in_repo=os.path.join(
self.general_config_tracker.model_name,
f"results_{self.date_id}.json",
),
repo_type="dataset", repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}", commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
) )
...@@ -275,6 +307,7 @@ class EvaluationTracker: ...@@ -275,6 +307,7 @@ class EvaluationTracker:
sample["resps"] = sanitize_list(sample["resps"]) sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"]) sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = ( sample_dump = (
json.dumps( json.dumps(
...@@ -285,14 +318,14 @@ class EvaluationTracker: ...@@ -285,14 +318,14 @@ class EvaluationTracker:
+ "\n" + "\n"
) )
with open(file_results_samples, "a") as f: with open(file_results_samples, "a", encoding="utf-8") as f:
f.write(sample_dump) f.write(sample_dump)
if self.api and self.push_samples_to_hub: if self.api and self.push_samples_to_hub:
repo_id = ( repo_id = (
self.hub_results_repo self.details_repo
if self.public_repo if self.public_repo
else self.hub_results_repo_private else self.details_repo_private
) )
self.api.create_repo( self.api.create_repo(
repo_id=repo_id, repo_id=repo_id,
...@@ -300,6 +333,18 @@ class EvaluationTracker: ...@@ -300,6 +333,18 @@ class EvaluationTracker:
private=not self.public_repo, private=not self.public_repo,
exist_ok=True, exist_ok=True,
) )
try:
if self.gated_repo:
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
except Exception as e:
eval_logger.warning("Could not gate the repository")
eval_logger.info(repr(e))
self.api.upload_folder( self.api.upload_folder(
repo_id=repo_id, repo_id=repo_id,
folder_path=str(path), folder_path=str(path),
...@@ -324,9 +369,7 @@ class EvaluationTracker: ...@@ -324,9 +369,7 @@ class EvaluationTracker:
""" """
eval_logger.info("Recreating metadata card") eval_logger.info("Recreating metadata card")
repo_id = ( repo_id = self.details_repo if self.public_repo else self.details_repo_private
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = get_results_filenames(files_in_repo) results_files = get_results_filenames(files_in_repo)
...@@ -357,7 +400,10 @@ class EvaluationTracker: ...@@ -357,7 +400,10 @@ class EvaluationTracker:
results_datetime, results_datetime,
) )
latest_task_results_datetime[samples_key] = latest_datetime latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = latest_datetime latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
# Create metadata card # Create metadata card
card_metadata = MetadataConfigs() card_metadata = MetadataConfigs()
...@@ -374,14 +420,15 @@ class EvaluationTracker: ...@@ -374,14 +420,15 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub( sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name] r"[^\w\.]", "_", latest_task_results_datetime[config_name]
) )
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results: if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append( card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]} {"split": "latest", "path": [str(results_filename)]}
) )
...@@ -400,65 +447,20 @@ class EvaluationTracker: ...@@ -400,65 +447,20 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub( sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name] r"[^\w\.]", "_", latest_task_results_datetime[config_name]
) )
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results: if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append( card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]} {"split": "latest", "path": [str(results_filename)]}
) )
# Special case for MMLU with a single split covering it all
# We add another config with all MMLU splits results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples # Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values()) latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max( latest_model_name = max(
......
...@@ -110,3 +110,34 @@ def add_env_info(storage: Dict[str, Any]): ...@@ -110,3 +110,34 @@ def add_env_info(storage: Dict[str, Any]):
"upper_git_hash": upper_dir_commit, # in case this repo is submodule "upper_git_hash": upper_dir_commit, # in case this repo is submodule
} }
storage.update(added_info) storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
    """Record the LM's tokenizer special tokens and length limits in `storage`.

    This is a best-effort helper: it never raises because tokenizer metadata
    is missing or unreadable.

    Args:
        storage: Dictionary updated in place. On success it gains the keys
            `tokenizer_pad_token`, `tokenizer_eos_token` and
            `tokenizer_bos_token` (each a `[token, str(token_id)]` pair), plus
            `eot_token_id` and `max_length` (`None` when `lm` lacks them).
        lm: Model wrapper. When it has no truthy `tokenizer` attribute nothing
            is stored and a debug message is logged instead.

    Returns:
        None. `storage` is left untouched if reading the tokenizer fails.
    """
    tokenizer = getattr(lm, "tokenizer", False)
    if not tokenizer:
        # seems gguf and textsynth do not have tokenizer
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
        )
        return

    try:
        tokenizer_info = {
            "tokenizer_pad_token": [
                tokenizer.pad_token,
                str(tokenizer.pad_token_id),
            ],
            "tokenizer_eos_token": [
                tokenizer.eos_token,
                str(tokenizer.eos_token_id),
            ],
            "tokenizer_bos_token": [
                tokenizer.bos_token,
                str(tokenizer.bos_token_id),
            ],
            "eot_token_id": getattr(lm, "eot_token_id", None),
            "max_length": getattr(lm, "max_length", None),
        }
        # Only update once the full dict is built, so a failure above cannot
        # leave `storage` with a partial set of tokenizer keys.
        storage.update(tokenizer_info)
    except Exception as err:
        # Deliberately broad: tokenizer objects come from many backends and
        # metadata logging must not abort an evaluation run.
        logger.debug(
            "Logging detailed tokenizer info failed with %s, skipping...", err
        )
from . import ( from . import (
anthropic_llms, anthropic_llms,
api_models,
dummy, dummy,
gguf, gguf,
huggingface, huggingface,
......
from typing import Any, List, Tuple import os
from functools import cached_property
from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm from tqdm import tqdm
from lm_eval import utils from lm_eval import utils
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import retry_on_specific_exceptions from lm_eval.models.utils import retry_on_specific_exceptions
...@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
return messages() return messages()
@register_model("anthropic") @register_model("anthropic-completions")
class AnthropicLM(LM): class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20 # TODO: not used REQ_CHUNK_SIZE = 20 # TODO: not used
...@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
@register_model("anthropic-chat", "anthropic-chat-completions") @register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChatLM(AnthropicLM): class AnthropicChat(LocalCompletionsAPI):
REQ_CHUNK_SIZE = 20 # TODO: not used
def __init__( def __init__(
self, self,
model: str, base_url="https://api.anthropic.com/v1/messages",
batch_size: int = 1, tokenizer_backend=None,
max_tokens: int = 256, **kwargs,
temperature: float = 0, # defaults to 1 ):
**kwargs, # top_p, top_k, etc. super().__init__(
) -> None: base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
"""Anthropic API wrapper. )
eval_logger.warning(
:param model: str "Chat completions does not support batching. Defaulting to batch size 1."
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229' )
:param max_tokens: int self._batch_size = 1
Maximum number of tokens to sample from the model self.anthropic_version = "2023-06-01"
:param temperature: float eval_logger.warning(
Sampling temperature f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
:param kwargs: Any )
Additional model_args to pass to the API client
"""
super().__init__()
try: @cached_property
import anthropic def api_key(self):
except ModuleNotFoundError: """Override this property to return the API key for the API request."""
raise Exception( key = os.environ.get("ANTHROPIC_API_KEY", None)
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ if key is None:
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", raise ValueError(
"API key not found. Please set the ANTHROPIC_API_KEY environment variable."
) )
return key
self.model = model
# defaults to os.environ.get("ANTHROPIC_API_KEY") @cached_property
self.client = anthropic.Anthropic() def header(self):
self.temperature = temperature return {
self.max_tokens = max_tokens "x-api-key": f"{self.api_key}",
self.tokenizer = self.client.get_tokenizer() "anthropic-version": self.anthropic_version,
self.kwargs = kwargs }
@property def _create_payload(
def max_gen_toks(self) -> int: self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
return self.max_tokens ) -> dict:
system = (
def generate_until(self, requests) -> List[str]: messages[0].get("content") if messages[0].get("role") == "system" else None
try: )
import anthropic if system:
except ModuleNotFoundError: messages = messages[1:]
raise Exception( gen_kwargs.pop("do_sample", False)
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", temperature = gen_kwargs.pop("temperature", 0)
) stop = gen_kwargs.pop("until", ["\n\nHuman:"])
if not isinstance(stop, list):
if not requests: stop = [stop]
return [] out = {
"messages": messages,
_requests: List[Tuple[str, dict]] = [req.args for req in requests] "model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop_sequences": stop,
**gen_kwargs,
}
if system:
out["system"] = system
return out
def parse_generations(
self, outputs: Union[Dict, List[Dict]], **kwargs
) -> List[str]:
res = [] res = []
for request in tqdm(_requests): if not isinstance(outputs, list):
try: outputs = [outputs]
inp = request[0] for out in outputs:
request_args = request[1] for choices in out["content"]:
# generation_kwargs res.append(choices["text"])
until = request_args.get("until")
max_tokens = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_chat(
client=self.client,
model=self.model,
prompt=inp,
max_tokens=max_tokens,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
return res return res
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> List[str]:
return [string]
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Anthropic Chat Completions API does not support the return of loglikelihood"
)
This diff is collapsed.
This diff is collapsed.
...@@ -231,6 +231,7 @@ class NEURON_HF(TemplateLM): ...@@ -231,6 +231,7 @@ class NEURON_HF(TemplateLM):
" For inf2.48xlarge, set it to `24`." " For inf2.48xlarge, set it to `24`."
) )
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF # TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "") revision = revision + ("/" + subfolder if subfolder is not None else "")
...@@ -288,7 +289,7 @@ class NEURON_HF(TemplateLM): ...@@ -288,7 +289,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = self.add_bos_token self.add_bos_token = add_bos_token
self._max_length = max_length self._max_length = max_length
......
This diff is collapsed.
...@@ -5,6 +5,7 @@ import itertools ...@@ -5,6 +5,7 @@ import itertools
import time import time
from functools import wraps from functools import wraps
from typing import ( from typing import (
TYPE_CHECKING,
Any, Any,
Callable, Callable,
Dict, Dict,
...@@ -24,6 +25,11 @@ import transformers ...@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger from lm_eval.utils import eval_logger
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
from transformers.configuration_utils import PretrainedConfig
def chunks(iter, n: int = 0, fn=None): def chunks(iter, n: int = 0, fn=None):
""" """
Divides an iterable into chunks of specified size or based on a given function. Divides an iterable into chunks of specified size or based on a given function.
...@@ -613,3 +619,48 @@ class Collator: ...@@ -613,3 +619,48 @@ class Collator:
if arr: if arr:
yield arr yield arr
def configure_pad_token(
    tokenizer: "PreTrainedTokenizerBase",
    model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
    """
    Ensure the (Hugging Face) tokenizer has a padding token, setting one if it is missing.

    Resolution order when no pad token is set:
      1. reuse the unknown token's id,
      2. otherwise reuse the end-of-sequence token's id,
      3. otherwise apply a model-specific special case (Qwen, RWKV),
      4. otherwise register a new `<|pad|>` special token.

    Args:
        tokenizer: The tokenizer for which the padding token is to be handled.
            It is modified in place.
        model_config: The configuration of the model; only its `model_type`
            attribute is consulted. Default is None.

    Returns:
        The same tokenizer object after the padding token has been handled.

    Raises:
        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
    """
    if tokenizer.pad_token:
        # Already configured; nothing to do.
        return tokenizer

    if tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
    elif tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    # handle special cases
    elif model_config and getattr(model_config, "model_type", None) == "qwen":
        # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
        tokenizer.pad_token = "<|endoftext|>"
    elif tokenizer.__class__.__name__ in ("RWKVWorldTokenizer", "Rwkv5Tokenizer"):
        # The RWKV world tokenizer does not allow for adding special tokens / setting the pad token (which is set as 0).
        # The additional tokenizer name check is needed, as there exist rwkv4 models with the neox tokenizer.
        # ---
        # Note that the world tokenizer class name might change in the future for the final huggingface merge:
        # https://github.com/huggingface/transformers/pull/26963
        assert tokenizer.pad_token_id == 0
    else:
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    return tokenizer
import copy import copy
from importlib.metadata import version from importlib.metadata import version
from importlib.util import find_spec from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from more_itertools import distribute from more_itertools import distribute
from packaging.version import parse as parse_version from packaging.version import parse as parse_version
...@@ -10,7 +10,7 @@ from tqdm import tqdm ...@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger, eval_logger,
get_rolling_token_windows, get_rolling_token_windows,
...@@ -26,6 +26,8 @@ try: ...@@ -26,6 +26,8 @@ try:
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
if TYPE_CHECKING:
pass
eval_logger = eval_logger eval_logger = eval_logger
...@@ -118,11 +120,12 @@ class VLLM(TemplateLM): ...@@ -118,11 +120,12 @@ class VLLM(TemplateLM):
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision, tokenizer_revision=tokenizer_revision,
) )
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower(): if "gemma" in pretrained.lower():
self.add_bos_token = True self.add_bos_token = True
eval_logger.info( eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it." "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
) )
self.custom_prefix_token_id = prefix_token_id self.custom_prefix_token_id = prefix_token_id
...@@ -176,23 +179,46 @@ class VLLM(TemplateLM): ...@@ -176,23 +179,46 @@ class VLLM(TemplateLM):
def max_gen_toks(self): def max_gen_toks(self):
return self._max_gen_toks return self._max_gen_toks
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
    """
    Method to apply a chat template to a list of chat history between user and model.

    Delegates to `self.tokenizer.apply_chat_template`, requesting the rendered
    text rather than token ids (`tokenize=False`) and asking for the generation
    prompt to be appended (`add_generation_prompt=True`).

    Args:
        chat_history: The conversation so far, as a list of message dicts.

    Returns:
        Whatever the tokenizer returns for the rendered conversation.
    """
    return self.tokenizer.apply_chat_template(
        chat_history, tokenize=False, add_generation_prompt=True
    )
@property
def chat_template(self) -> str:
    """
    Chat template used by the tokenizer.

    Returns the tokenizer's own `chat_template` when it is set (not None);
    otherwise falls back to the tokenizer's `default_chat_template`.
    """
    if self.tokenizer.chat_template is not None:
        return self.tokenizer.chat_template
    # NOTE(review): relies on the tokenizer exposing `default_chat_template`;
    # confirm this attribute exists in the supported transformers versions.
    return self.tokenizer.default_chat_template
@property
def tokenizer_name(self) -> str:
    """
    Tokenizer identifier derived from `self.tokenizer.name_or_path`, with every
    "/" replaced by "__".
    """
    return self.tokenizer.name_or_path.replace("/", "__")
def tok_encode( def tok_encode(
self, self,
string: str, string: Union[str, List[str]],
left_truncate_len=None, left_truncate_len: int = None,
add_special_tokens=None, add_special_tokens: bool = False,
truncation=False, truncation: bool = False,
): ) -> Union[List[int], List[List[int]]]:
""" """
if not add_special_tokens: if not add_special_tokens:
add_special_tokens = False or self.add_bos_token add_special_tokens = False or self.add_bos_token
encoding = self.tokenizer.encode( encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string, add_special_tokens=add_special_tokens, truncation=truncation string,
) add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long # left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len: if left_truncate_len:
encoding = encoding[-left_truncate_len:] if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding return encoding
...@@ -209,7 +235,7 @@ class VLLM(TemplateLM): ...@@ -209,7 +235,7 @@ class VLLM(TemplateLM):
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs) sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else: else:
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1 temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
) )
if self.data_parallel_size > 1: if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
...@@ -290,7 +316,9 @@ class VLLM(TemplateLM): ...@@ -290,7 +316,9 @@ class VLLM(TemplateLM):
# batch tokenize contexts # batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests)) context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [ requests = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs) ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
] ]
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
...@@ -19,11 +20,13 @@ ...@@ -19,11 +20,13 @@
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German | | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | | benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby | | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
...@@ -46,6 +49,7 @@ ...@@ -46,6 +49,7 @@
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English | | [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
...@@ -53,23 +57,28 @@ ...@@ -53,23 +57,28 @@
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | | [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | | [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization. | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English | | [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their ability to interpret medical codes and distinguish between medical concepts. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English | | medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | | | medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | | [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | | mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | | [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | | [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) | | [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) | | okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) | | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
...@@ -105,7 +114,7 @@ ...@@ -105,7 +114,7 @@
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish | | [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English | | [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese | | [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greekm English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese | | [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | | [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
This diff is collapsed.
...@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE ...@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
} }
``` ```
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Groups
......
group: aclue
task:
- aclue_ancient_chinese_culture
- aclue_ancient_literature
- aclue_ancient_medical
- aclue_ancient_phonetics
- aclue_basic_ancient_chinese
- aclue_couplet_prediction
- aclue_homographic_character_resolution
- aclue_named_entity_recognition
- aclue_poetry_appreciate
- aclue_poetry_context_prediction
- aclue_poetry_quality_assessment
- aclue_poetry_sentiment_analysis
- aclue_polysemy_resolution
- aclue_reading_comprehension
- aclue_sentence_segmentation
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
group: aclue
dataset_path: tyouisen/aclue dataset_path: tyouisen/aclue
test_split: test test_split: test
fewshot_split: dev fewshot_split: dev
...@@ -16,4 +15,4 @@ metric_list: ...@@ -16,4 +15,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt ...@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
### Citation ### Citation
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Groups
- `EXAMS Arabic`: include IslamicStudies, Biology, Science, Physics, Social. - `aexams`: Arabic EXAMS dataset, including IslamicStudies, Biology, Science, Physics, Social subjects.
#### Tasks #### Tasks
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment