Commit d859d1ca authored by Nathan Habib

batch commit

parent 6e49b1f6
......@@ -102,10 +102,12 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
)
```
See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
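For instance, a minimal `simple_evaluate()` call might look like the sketch below; the checkpoint, task list, and batch size are placeholders, and each keyword argument plays the same role as the CLI flag of the same name (`--model`, `--model_args`, `--tasks`, `--num_fewshot`, `--batch_size`).
```python
import lm_eval

# roughly equivalent to:
# lm_eval --model hf --model_args pretrained=EleutherAI/pythia-160m \
#         --tasks hellaswag --num_fewshot 0 --batch_size 8
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["hellaswag"],
    num_fewshot=0,
    batch_size=8,
)
```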
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
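# NOTE: the body of this example is elided by the diff hunk below; what follows is a
# rough, self-contained sketch. The model checkpoint, task name, and batch size are
# placeholders rather than the values used in the original documentation.
import lm_eval
from lm_eval.models.huggingface import HFLM

# wrap a Hugging Face model so it exposes the LM API expected by evaluate()
lm_obj = HFLM(pretrained="EleutherAI/pythia-160m", batch_size=16)

# index the available task configs; passed to get_task_dict() below
task_manager = lm_eval.tasks.TaskManager()

task_dict = lm_eval.tasks.get_task_dict(["hellaswag"], task_manager)

# run the core evaluation loop directly, without simple_evaluate()'s extra handling
results = lm_eval.evaluate(lm=lm_obj, task_dict=task_dict)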
......@@ -145,7 +147,7 @@ task_dict = lm_eval.tasks.get_task_dict(
task_manager # A task manager that allows lm_eval to
# load the task during evaluation.
# If none is provided, `get_task_dict`
# will instantiate one itself, but this
# will instantiate one itself, but this
# only includes the stock tasks so users
# will need to set this if including
# custom paths is required.
......
......@@ -5,6 +5,7 @@ import os
import sys
from functools import partial
from typing import Union
from accelerate import Accelerator
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
......@@ -292,13 +293,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
)
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
......@@ -354,17 +348,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
if args.trust_remote_code:
eval_logger.info(
"Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
)
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
args.model_args = (
args.model_args
+ f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
)
# HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True"
eval_logger.info(f"Selected Tasks: {task_names}")
......@@ -400,7 +388,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
**request_caching_args,
)
if results is not None:
accelerator = Accelerator()
if results is not None and accelerator.is_main_process:
if args.log_samples:
samples = results.pop("samples")
dumped = json.dumps(
......
......@@ -67,9 +67,9 @@ class TaskConfig(dict):
training_split: Optional[str] = None
validation_split: Optional[str] = None
test_split: Optional[str] = None
fewshot_split: Optional[str] = (
None  # TODO: assert that this is not None if num_fewshot > 0. (?) assert if this is the same split as the one being evaluated on (?)
)
fewshot_split: Optional[
str
] = None  # TODO: assert that this is not None if num_fewshot > 0. (?) assert if this is the same split as the one being evaluated on (?)
# formatting / prompting options.
# see docs/advanced_task_guide.md for more info
process_docs: Optional[Callable] = None
......@@ -92,9 +92,9 @@ class TaskConfig(dict):
filter_list: Optional[Union[str, list]] = None
should_decontaminate: bool = False
doc_to_decontamination_query: Optional[str] = None
metadata: Optional[dict] = (
None # by default, not used in the code. allows for users to pass arbitrary info to tasks
)
metadata: Optional[
dict
] = None # by default, not used in the code. allows for users to pass arbitrary info to tasks
def __post_init__(self) -> None:
if self.generation_kwargs is not None:
......@@ -229,9 +229,9 @@ class Task(abc.ABC):
self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
self.fewshot_rnd: Optional[random.Random] = (
None # purposely induce errors in case of improper usage
)
self.fewshot_rnd: Optional[
random.Random
] = None # purposely induce errors in case of improper usage
def download(
self,
......@@ -368,16 +368,15 @@ class Task(abc.ABC):
def build_all_requests(
self,
*,
limit: Union[int, None] = None,
rank: int = 0,
world_size: int = 1,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
limit=None,
rank=None,
world_size=None,
cache_requests=False,
rewrite_requests_cache=False,
system_instruction=None,
apply_chat_template=False,
fewshot_as_multiturn=False,
lm=None,
) -> None:
"""Build a set of Instances for a task, and store them in task.instances"""
......@@ -392,7 +391,7 @@ class Task(abc.ABC):
if system_instruction is not None
else ""
)
cache_key += f"-tokenizer{tokenizer_name}"
cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
cached_instances = load_from_cache(file_name=cache_key)
......@@ -437,7 +436,7 @@ class Task(abc.ABC):
system_instruction,
apply_chat_template,
fewshot_as_multiturn,
chat_template,
lm,
)
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
......@@ -445,6 +444,7 @@ class Task(abc.ABC):
doc=doc,
ctx=fewshot_ctx,
metadata=(self.config["task"], doc_id, self.config.repeats),
apply_chat_template=apply_chat_template
)
if not isinstance(inst, list):
......@@ -987,6 +987,28 @@ class ConfigurableTask(Task):
return super().fewshot_docs()
@staticmethod
def append_target_question(
labeled_examples: List[Dict[str, str]],
question: str,
fewshot_as_multiturn: bool = False,
) -> None:
"""Adds a target question to the labeled examples list.
If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
"""
if not fewshot_as_multiturn:
# if no messages or last message is system, append as new user entry
if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
labeled_examples.append({"role": "user", "content": question})
# if last message is user, append to it to avoid two user messages in a row
else:
labeled_examples[-1]["content"] += question
else:
# if fewshot_as_multiturn is True, append as a new user entry (the last turn is always an assistant turn)
labeled_examples.append({"role": "user", "content": question})
@staticmethod
def append_target_question(
labeled_examples: List[Dict[str, str]],
question: str,
......@@ -1015,7 +1037,7 @@ class ConfigurableTask(Task):
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
lm=None,
) -> str:
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -1028,10 +1050,12 @@ class ConfigurableTask(Task):
System instruction to be applied to the prompt.
:param apply_chat_template: bool
Whether to apply the chat template to the fewshot context.
:param tokenizer:
The tokenizer to use for applying the chat template.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:param lm:
Language model with definition of the tokenizer/function to use for applying the chat template.
:returns: str
The fewshot context.
"""
......@@ -1078,7 +1102,7 @@ class ConfigurableTask(Task):
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
return chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
......@@ -1090,7 +1114,7 @@ class ConfigurableTask(Task):
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(chat_template(chat))
labeled_examples_list.append(lm.apply_chat_template(chat))
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
......@@ -1104,7 +1128,7 @@ class ConfigurableTask(Task):
labeled_examples, str(example), fewshot_as_multiturn
)
# return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples)
else:
if self.multiple_input:
return labeled_examples
......@@ -1271,6 +1295,8 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc)
target_delimiter = self.config.target_delimiter
if kwargs.get("apply_chat_template", False) is True:
target_delimiter = ""
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
......@@ -1280,6 +1306,7 @@ class ConfigurableTask(Task):
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
kwargs.pop("apply_chat_template")
request_list = [
Instance(
......@@ -1316,6 +1343,7 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
kwargs.pop("apply_chat_template")
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
)
......
......@@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import (
run_task_tests,
)
from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
......@@ -271,7 +271,6 @@ def simple_evaluate(
model_args=model_args,
system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None,
fewshot_as_multiturn=fewshot_as_multiturn,
)
results = evaluate(
......@@ -326,7 +325,6 @@ def simple_evaluate(
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
add_tokenizer_info(results, lm) # additional info about tokenizer
return results
else:
return None
......@@ -399,12 +397,7 @@ def evaluate(
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
lm=lm,
)
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
......@@ -614,16 +607,16 @@ def evaluate(
]
# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
......
......@@ -275,9 +275,9 @@ def consolidate_results(
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
)
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
return results, samples, configs, versions, num_fewshot, higher_is_better
......
......@@ -4,6 +4,7 @@ from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
A filter which evaluates
"""
......
......@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
resp = resp[1:]
resp = resp.lstrip()
filtered_resp.append(resp)
return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps]
......
import json
import os
import re
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from datasets import load_dataset
from datasets.utils.metadata import MetadataConfigs
......@@ -17,15 +19,9 @@ from huggingface_hub import (
from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
get_sample_results_filenames,
handle_non_serializable,
hash_string,
sanitize_list,
sanitize_model_name,
sanitize_task_name,
)
......@@ -48,7 +44,6 @@ class GeneralConfigTracker:
model_name_sanitized: str = None
system_instruction: str = None
system_instruction_sha: str = None
fewshot_as_multiturn: bool = None
chat_template: str = None
chat_template_sha: str = None
start_time: float = None
......@@ -81,19 +76,24 @@ class GeneralConfigTracker:
model_args: str,
system_instruction: str,
chat_template: str,
fewshot_as_multiturn: bool,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = sanitize_model_name(self.model_name)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
self.system_instruction = system_instruction
self.system_instruction_sha = (
hash_string(system_instruction) if system_instruction else None
)
self.chat_template = chat_template
self.chat_template_sha = hash_string(chat_template) if chat_template else None
self.fewshot_as_multiturn = fewshot_as_multiturn
self.chat_template_sha = None
if chat_template:
if not isinstance(chat_template, str):
self.chat_template_sha = hash_string(str(chat_template))
else:
self.chat_template_sha = hash_string(chat_template)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
......@@ -210,21 +210,17 @@ class EvaluationTracker:
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
repo_id = (
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private
)
repo_id = "open-llm-leaderboard/results_v2"
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
self.api.upload_file(
repo_id=repo_id,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
path_in_repo=os.path.join(self.general_config_tracker.model_name, f"results_{self.date_id}.json"),
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
......@@ -262,7 +258,7 @@ class EvaluationTracker:
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.jsonl"
f"samples_{task_name}_{self.date_id}.json"
)
for sample in samples:
......@@ -278,6 +274,7 @@ class EvaluationTracker:
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = (
json.dumps(
......@@ -303,6 +300,13 @@ class EvaluationTracker:
private=not self.public_repo,
exist_ok=True,
)
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
self.api.upload_folder(
repo_id=repo_id,
folder_path=str(path),
......@@ -326,14 +330,23 @@ class EvaluationTracker:
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""
def get_file_task_name(filename: str) -> str:
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_task_name(task_name: str) -> str:
return re.sub(r"\W", "_", task_name)
eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = get_results_filenames(files_in_repo)
sample_files = get_sample_results_filenames(files_in_repo)
results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
......@@ -360,7 +373,10 @@ class EvaluationTracker:
results_datetime,
)
latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = latest_datetime
latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
# Create metadata card
card_metadata = MetadataConfigs()
......@@ -377,14 +393,15 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
......@@ -403,64 +420,65 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
print(f"adding {config_name} for {eval_date_sanitized}")
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Special case for MMLU with a single split covering it all
# We add another config with all MMLU splits results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
# Special case for MMLU with a single split covering it all
# We add another config with all MMLU splits results together for easy inspection
SPECIAL_TASKS = ["leaderboard_gpqa", "leaderboard_math", "leaderboard_bbh", "leaderboard_musr"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
if entry.get("split", None) == eval_date_sanitized
]
if len(latest_split) == 0:
if len(former_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values())
......
......@@ -110,20 +110,3 @@ def add_env_info(storage: Dict[str, Any]):
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False):
tokenizer_info = {
"tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
"tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
"tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
# seems gguf and textsynth do not have tokenizer
else:
logger.debug(
"LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
)
......@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_tokens = max_tokens
self.max_token = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
......
......@@ -3,6 +3,7 @@ import os
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, Union
import jinja2
import torch
import torch.nn.functional as F
......@@ -13,6 +14,7 @@ from accelerate import (
InitProcessGroupKwargs,
find_executable_batch_size,
)
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from peft import PeftModel
......@@ -39,31 +41,6 @@ from lm_eval.models.utils import (
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
......@@ -104,7 +81,6 @@ class HFLM(TemplateLM):
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
......@@ -127,21 +103,6 @@ class HFLM(TemplateLM):
self._config = self._model.config
gpus = 0
if tokenizer:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
model_name = self._model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
else:
assert isinstance(device, str)
assert isinstance(pretrained, str)
......@@ -150,8 +111,7 @@ class HFLM(TemplateLM):
gpus = torch.cuda.device_count()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
self.accelerator = accelerator
if "npu" in accelerator.device.type:
gpus = torch.npu.device_count()
......@@ -181,13 +141,13 @@ class HFLM(TemplateLM):
if torch.cuda.is_available()
else torch.device("cpu")
)
else:
else: # Parallelism managed by accelerate
if device != "cuda":
eval_logger.info(
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = torch.device(device)
self._device = self.accelerator.device if self.accelerator is not None else torch.device(device)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
......@@ -195,7 +155,7 @@ class HFLM(TemplateLM):
self._get_config(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
trust_remote_code=trust_remote_code,
)
# determine which of 'causal' and 'seq2seq' backends to use
......@@ -221,7 +181,6 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
parallelize=parallelize,
gpus=gpus,
device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
......@@ -236,19 +195,6 @@ class HFLM(TemplateLM):
self.model.eval()
self.model.tie_weights()
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
self.truncation = truncation
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
......@@ -301,15 +247,24 @@ class HFLM(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
if isinstance(pretrained, str):
if (gpus >= 1 or str(self.device) == "mps"):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
if parallelize:
if accelerator.num_processes > 1:
raise RuntimeError(
"Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
)
else:
pass
if parallelize and accelerator.num_processes > 1:
eval_logger.warning(
"You are both using a HF Accelerate `device_map` and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
)
elif accelerator.num_processes == 1:
# if we aren't launching via accelerate, ditch
self._rank = 0
......@@ -358,6 +313,77 @@ class HFLM(TemplateLM):
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
def _get_accelerate_args(
self,
parallelize: bool = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
if num_machines == 0:
eval_logger.info("We are not in a distributed setting. Setting model_parallel to False.")
parallelize = False
if parallelize is None:
# If parallelism is unset by the user, we automatically assign model parallelism
# if enough extra GPUs are available
max_memory_all_gpus = get_max_memory()
# We just want gpu, not cpu, max memory
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
model_parallel = bool(num_local_processes < len(max_memory_all_gpus))
eval_logger.info(
f"Setting model parallel to {model_parallel} since "
f"the number of local processes is {num_local_processes} "
f"and the number of GPUs is {len(max_memory_all_gpus)}"
)
args = {}
if parallelize: # Model parallelism will be used
max_memory = {}
if max_memory_per_gpu is not None: # Using the provided memory requirements
max_memory_per_gpu_map = {device_idx: max_memory_per_gpu for device_idx in range(gpus)}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
max_memory_per_gpu_map = {
k: v
for k, v in max_memory_all_gpus.items()
if k % num_local_processes == (self.accelerator.process_index % num_local_processes)
}
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto"
eval_logger.info(
f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to 'auto'"
)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args["offload_folder"] = offload_folder
elif device_map is None: # No model parallelism, we use the default provided device for our model
if hasattr(self, "accelerator"):
device_map = {"": f"{self.accelerator.device}"}
else:
device_map = {"": str(self.device)}
args["max_memory"] = None
args["device_map"] = device_map
eval_logger.info(
f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
)
else:
args["max_memory"] = None
args["device_map"] = None
eval_logger.info("Model parallel was set to False.")
return args
@property
def config(self):
# return the associated transformers.AutoConfig for the given pretrained model.
......@@ -487,11 +513,13 @@ class HFLM(TemplateLM):
revision: str = "main",
trust_remote_code: bool = False,
) -> None:
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
with self.accelerator.main_process_first():
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
force_download=False,
)
def _create_model(
self,
......@@ -504,7 +532,6 @@ class HFLM(TemplateLM):
# (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
......@@ -528,25 +555,16 @@ class HFLM(TemplateLM):
model_kwargs = kwargs if kwargs else {}
if parallelize:
model_kwargs.update(
_get_accelerate_args(
device_map_option, # TODO: phase out device_map_option?
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
gpus,
)
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
gpus=gpus,
)
elif "device_map" not in model_kwargs:
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
)
if not autogptq:
if model_kwargs.get("load_in_4bit", None):
......@@ -559,13 +577,17 @@ class HFLM(TemplateLM):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
with self.accelerator.main_process_first():
#model_kwargs["device_map"] = "balanced_low_0"
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
force_download=False,
**model_kwargs,
)
else:
try:
from auto_gptq import AutoGPTQForCausalLM
......@@ -657,6 +679,7 @@ class HFLM(TemplateLM):
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
force_download=False
)
else:
assert isinstance(
......@@ -678,43 +701,57 @@ class HFLM(TemplateLM):
)
return None
def _detect_batch_size(self, requests=None, pos: int = 0):
if requests:
def _detect_batch_size(self, requests=None, pos: int = 0) -> int:
if len(requests[0]) == 3: # logprob evals
_, context_enc, continuation_enc = requests[pos]
max_length = len(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
)
max_context_enc = len(context_enc[-(self.max_length + 1) :])
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else:
max_length = self.max_length
security_margin_factor = 6 # batch sizes for log prob evals sometimes generate OOMs
elif len(requests[0]) == 2: # generative evals
# using rolling window with maximum context
longest_context = max([len(self.tok_encode(request[0])) + request[1].get("max_gen_toks", self.max_length) for request in requests[pos:]])
if longest_context > self.max_length:
eval_logger.warning(
f"Longest context length of {longest_context} exceeds max_length of {self.max_length}. Truncating to max_length."
)
longest_context = self.max_length
max_length = longest_context
max_context_enc = max_length
max_cont_enc = max_length
security_margin_factor = 6
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
security_margin = int(0.05 * security_margin_factor * batch_size)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
(batch_size, length), device=self.device
(batch_size + security_margin, length), device=self.device
).long()
test_batch = torch.ones((batch_size, length), device=self.device).long()
test_batch = torch.ones((batch_size + security_margin, length), device=self.device).long()
call_kwargs = {
"attn_mask": test_batch,
"labels": batched_conts,
}
else:
call_kwargs = {}
test_batch = torch.ones(
(batch_size, max_length), device=self.device
test_batch = torch.rand(
(batch_size + security_margin, max_length), device=self.device
).long()
for _ in range(5):
out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) # noqa: F841
for _ in range(5*security_margin_factor):
logits = self._model_call(inps=test_batch, **call_kwargs).float()
scores = F.log_softmax(logits, dim=-1) # noqa: F841
return batch_size
try:
print(f"finding batch size on process {self.accelerator.local_process_index}")
batch_size = forward_batch()
except RuntimeError as e:
if "No executable batch size found" in str(e):
......@@ -725,6 +762,7 @@ class HFLM(TemplateLM):
if self.world_size > 1:
# if multi-GPU, always take minimum over all selected batch sizes
max_rnk_bs = torch.tensor([batch_size], device=self.device)
print(f"gathering on process {self.accelerator.local_process_index}")
gathered = (
self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
)
......@@ -945,6 +983,10 @@ class HFLM(TemplateLM):
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
def _reset_batch_scheduler(self):
"""When we change group in generative evaluations, we reset the batch size"""
self.batch_sizes = {}
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
......@@ -1002,7 +1044,7 @@ class HFLM(TemplateLM):
else None
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn, accelerator=self.accelerator)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
......@@ -1022,6 +1064,8 @@ class HFLM(TemplateLM):
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
from pprint import pprint
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
......@@ -1104,7 +1148,7 @@ class HFLM(TemplateLM):
}
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
self._model_call(batched_inps, **call_kwargs), dim=-1, dtype=torch.float16
) # [batch, padding_length (inp or cont), vocab]
for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
......@@ -1166,6 +1210,8 @@ class HFLM(TemplateLM):
) -> List[str]:
res = []
self.accelerator.wait_for_everyone()
def _collate(req: Tuple[str, dict]):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
......@@ -1182,24 +1228,14 @@ class HFLM(TemplateLM):
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
batch_size = (
self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
if self.batch_size == "auto" # and not adaptive_batch_size
else None
)
......@@ -1213,7 +1249,7 @@ class HFLM(TemplateLM):
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn, reset_batch_fn=self._reset_batch_scheduler)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
......@@ -1243,6 +1279,8 @@ class HFLM(TemplateLM):
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
if max_gen_toks > self.max_length:
max_gen_toks = self.max_gen_toks
else:
max_gen_toks = self.max_gen_toks
......@@ -1250,6 +1288,9 @@ class HFLM(TemplateLM):
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
while max_ctx_len <= 0:
max_gen_toks = max_gen_toks // 2
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
......@@ -1304,9 +1345,21 @@ class HFLM(TemplateLM):
"""
Method to apply a chat template to a list of chat history between user and model.
"""
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
"Failed to apply chat template. removing the system role in chat history."
)
chat_history = [msg for msg in chat_history if msg["role"] != "system"]
chat_templated = self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
return chat_templated
def get_model_info(self) -> dict:
"""
......
......@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = add_bos_token
self.add_bos_token = self.add_bos_token
self._max_length = max_length
......
"""TextSynth API
""" TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
......@@ -11,7 +11,6 @@ Example usage:
Homepage: https://textsynth.com/index.html
"""
import logging
import os
......
......@@ -389,7 +389,7 @@ class Collator:
self._arr_with_indices, fn=self._group_fn, group_by="contexts"
)
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None, reset_batch_fn: Optional[Callable] = None, accelerator=None) -> Iterator:
"""
Generates and yields batches from the reordered array. The method of grouping and batching
depends on the parameter `group_by`.
......@@ -402,6 +402,8 @@ class Collator:
- n (int): The size of each batch. Defaults to 1.
- batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of
each batch. Optional, defaults to None.
- reset_batch_fn ([Callable[[int, Iterable], int]] | None): A function to reset the scheduler of
the batch_fn, if present, when we change group in generative mode.
Returns:
Iterator: An iterator over batches of reordered elements grouped as per the `group_by`
......@@ -411,10 +413,9 @@ class Collator:
List of batched elements according to the `group_by` attribute.
"""
if self._group_by == "gen_kwargs":
for (
key,
values,
) in self._arr_with_indices.items(): # type: ignore
for key, values in self._arr_with_indices.items(): # type: ignore
if reset_batch_fn is not None: # with each group change, we must recompute the batch size, so we restart the scheduler
reset_batch_fn()
values = self._reorder(values)
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
......
......@@ -119,12 +119,6 @@ class VLLM(TemplateLM):
tokenizer_revision=tokenizer_revision,
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
)
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
......@@ -499,10 +493,7 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
if do_sample is False or "temperature" not in kwargs:
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
......@@ -10,8 +10,8 @@
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
......@@ -20,13 +20,11 @@
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
......@@ -73,7 +71,6 @@
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
......
......@@ -14,43 +14,27 @@ class TaskManager:
"""
def __init__(
self,
verbosity="INFO",
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
) -> None:
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
)
self._task_index = self.initialize_tasks(include_path=include_path)
self._all_tasks = sorted(list(self._task_index.keys()))
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(
self,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
):
def initialize_tasks(self, include_path: Optional[str] = None):
"""Creates a dictionary of tasks index.
:param include_path: Union[str, List] = None
An additional path to be searched for tasks recursively.
Can provide more than one such path as a list.
:param include_defaults: bool = True
If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
:param include_path: str = None
An additional path to be searched for tasks
:return
Dictionary of task names as key and task metadata
"""
if include_defaults:
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
else:
all_paths = []
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
......@@ -312,13 +296,8 @@ class TaskManager:
:return
Dictionary of task names as key and task metadata
"""
ignore_dirs = [
"__pycache__",
".ipynb_checkpoints",
]
tasks_and_groups = collections.defaultdict()
for root, dirs, file_list in os.walk(task_dir):
dirs[:] = [d for d in dirs if d not in ignore_dirs]
for root, _, file_list in os.walk(task_dir):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import re
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......