Unverified Commit a2af2101 authored by Yen-Ting Lin's avatar Yen-Ting Lin Committed by GitHub
Browse files

Merge branch 'EleutherAI:main' into main

parents 82cb25c1 d5f39bf8
......@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
resp = resp[1:]
resp = resp.lstrip()
filtered_resp.append(resp)
return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps]
......
import json
import os
import re
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from datasets import load_dataset
from datasets.utils.metadata import MetadataConfigs
from huggingface_hub import (
DatasetCard,
DatasetCardData,
HfApi,
hf_hub_url,
)
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
get_sample_results_filenames,
handle_non_serializable,
hash_string,
sanitize_list,
sanitize_model_name,
sanitize_task_name,
)
......@@ -31,6 +48,11 @@ class GeneralConfigTracker:
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
system_instruction: str = None
system_instruction_sha: str = None
fewshot_as_multiturn: bool = None
chat_template: str = None
chat_template_sha: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
......@@ -59,13 +81,21 @@ class GeneralConfigTracker:
self,
model_source: str,
model_args: str,
system_instruction: str,
chat_template: str,
fewshot_as_multiturn: bool,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
self.model_name_sanitized = sanitize_model_name(self.model_name)
self.system_instruction = system_instruction
self.system_instruction_sha = (
hash_string(system_instruction) if system_instruction else None
)
self.chat_template = chat_template
self.chat_template_sha = hash_string(chat_template) if chat_template else None
self.fewshot_as_multiturn = fewshot_as_multiturn
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
......@@ -81,37 +111,81 @@ class EvaluationTracker:
def __init__(
self,
output_path: str = "",
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
details_repo_name: str = "",
results_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
leaderboard_url: str = "",
point_of_contact: str = "",
gated: bool = False,
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card.
gated (bool): Whether to gate the repository.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.leaderboard_url = leaderboard_url
self.point_of_contact = point_of_contact
self.api = HfApi(token=token) if token else None
self.gated_repo = gated
if not self.api and (push_results_to_hub or push_samples_to_hub):
raise ValueError(
"Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
"Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
)
if (
self.api
and hub_results_org == ""
and (push_results_to_hub or push_samples_to_hub)
):
hub_results_org = self.api.whoami()["name"]
eval_logger.warning(
f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
)
if hub_repo_name == "":
details_repo_name = (
details_repo_name if details_repo_name != "" else "lm-eval-results"
)
results_repo_name = (
results_repo_name if results_repo_name != "" else details_repo_name
)
else:
details_repo_name = hub_repo_name
results_repo_name = hub_repo_name
eval_logger.warning(
"hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
)
self.details_repo = f"{hub_results_org}/{details_repo_name}"
self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
self.results_repo = f"{hub_results_org}/{results_repo_name}"
self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
def save_results_aggregated(
self,
......@@ -160,23 +234,33 @@ class EvaluationTracker:
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
repo_id = (
self.results_repo
if self.public_repo
else self.hub_results_repo_private,
else self.results_repo_private
)
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
self.api.upload_file(
repo_id=repo_id,
path_or_fileobj=str(
path.joinpath(f"results_{self.date_id}.json")
),
path_in_repo=os.path.join(
self.general_config_tracker.model_name,
f"results_{self.date_id}.json",
),
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
eval_logger.info(
"Successfully pushed aggregated results to the Hugging Face Hub. "
f"You can find them at: {repo_id}"
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
......@@ -200,44 +284,238 @@ class EvaluationTracker:
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
eval_logger.info(f"Saving per-sample results for: {task_name}")
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
f"samples_{task_name}_{self.date_id}.jsonl"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
for sample in samples:
# we first need to sanitize arguments and resps
# otherwise we won't be able to load the dataset
# using the datasets library
arguments = {}
for i, arg in enumerate(sample["arguments"]):
arguments[f"gen_args_{i}"] = {}
for j, tmp in enumerate(arg):
arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = (
json.dumps(
sample,
default=handle_non_serializable,
ensure_ascii=False,
)
+ "\n"
)
with open(file_results_samples, "a", encoding="utf-8") as f:
f.write(sample_dump)
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
repo_id = (
self.details_repo
if self.public_repo
else self.hub_results_repo_private,
else self.details_repo_private
)
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
try:
if self.gated_repo:
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
except Exception as e:
eval_logger.warning("Could not gate the repository")
eval_logger.info(repr(e))
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_id=repo_id,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
eval_logger.info(
f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
f"You can find them at: {repo_id}"
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
def recreate_metadata_card(self) -> None:
"""
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""
eval_logger.info("Recreating metadata card")
repo_id = self.details_repo if self.public_repo else self.details_repo_private
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = get_results_filenames(files_in_repo)
sample_files = get_sample_results_filenames(files_in_repo)
# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
# - Each task and sample results, if existing
# i.e. {
# "org__model_name__gsm8k": "2021-09-01T12:00:00",
# "org__model_name__ifeval": "2021-09-01T12:00:00",
# "org__model_name__results": "2021-09-01T12:00:00"
# }
latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
for file_path in sample_files:
file_path = Path(file_path)
filename = file_path.name
model_name = file_path.parent
task_name = get_file_task_name(filename)
results_datetime = get_file_datetime(filename)
task_name_sanitized = sanitize_task_name(task_name)
# Results and sample results for the same model and task will have the same datetime
samples_key = f"{model_name}__{task_name_sanitized}"
results_key = f"{model_name}__results"
latest_datetime = max(
latest_task_results_datetime[samples_key],
results_datetime,
)
latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
# Create metadata card
card_metadata = MetadataConfigs()
# Add the latest aggregated results to the metadata card for easy access
for file_path in results_files:
file_path = Path(file_path)
results_filename = file_path.name
model_name = file_path.parent
eval_date = get_file_datetime(results_filename)
eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
results_filename = Path("**") / Path(results_filename).name
config_name = f"{model_name}__results"
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Add the tasks details configs
for file_path in sample_files:
file_path = Path(file_path)
filename = file_path.name
model_name = file_path.parent
task_name = get_file_task_name(filename)
eval_date = get_file_datetime(filename)
task_name_sanitized = sanitize_task_name(task_name)
eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
results_filename = Path("**") / Path(filename).name
config_name = f"{model_name}__{task_name_sanitized}"
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max(
latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
)
last_results_file = [
f for f in results_files if latest_datetime.replace(":", "-") in f
][0]
last_results_file_path = hf_hub_url(
repo_id=repo_id, filename=last_results_file, repo_type="dataset"
)
latest_results_file = load_dataset(
"json", data_files=last_results_file_path, split="train"
)
results_dict = latest_results_file["results"][0]
new_dictionary = {"all": results_dict}
new_dictionary.update(results_dict)
results_string = json.dumps(new_dictionary, indent=4)
dataset_summary = (
"Dataset automatically created during the evaluation run of model "
)
if self.general_config_tracker.model_source == "hf":
dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
else:
dataset_summary += f"{self.general_config_tracker.model_name}\n"
dataset_summary += (
f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
'An additional configuration "results" store all the aggregated results of the run.\n\n'
"To load the details from a run, you can for instance do the following:\n"
)
if self.general_config_tracker.model_source == "hf":
dataset_summary += (
"```python\nfrom datasets import load_dataset\n"
f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
)
dataset_summary += (
"## Latest results\n\n"
f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
"(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```"
)
card_data = DatasetCardData(
dataset_summary=dataset_summary,
repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
leaderboard_url=self.leaderboard_url,
point_of_contact=self.point_of_contact,
)
card_metadata.to_dataset_card_data(card_data)
card = DatasetCard.from_template(
card_data,
pretty_name=card_data.pretty_name,
)
card.push_to_hub(repo_id, repo_type="dataset")
......@@ -110,3 +110,34 @@ def add_env_info(storage: Dict[str, Any]):
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False):
try:
tokenizer_info = {
"tokenizer_pad_token": [
lm.tokenizer.pad_token,
str(lm.tokenizer.pad_token_id),
],
"tokenizer_eos_token": [
lm.tokenizer.eos_token,
str(lm.tokenizer.eos_token_id),
],
"tokenizer_bos_token": [
lm.tokenizer.bos_token,
str(lm.tokenizer.bos_token_id),
],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
except Exception as err:
logger.debug(
f"Logging detailed tokenizer info failed with {err}, skipping..."
)
# seems gguf and textsynth do not have tokenizer
else:
logger.debug(
"LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
)
......@@ -7,7 +7,7 @@ import numpy as np
import pandas as pd
from packaging.version import Version
from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
logger = logging.getLogger(__name__)
......
......@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_token = max_tokens
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
......
......@@ -2,7 +2,7 @@ import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
from typing import Dict, List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
......@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
clear_torch_cache,
configure_pad_token,
get_dtype,
pad_and_concat,
stop_sequences_criteria,
......@@ -44,13 +45,13 @@ def _get_accelerate_args(
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
......@@ -153,12 +154,16 @@ class HFLM(TemplateLM):
if accelerator.num_processes > 1:
self.accelerator = accelerator
if "npu" in accelerator.device.type:
gpus = torch.npu.device_count()
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ [f"cuda:{i}" for i in range(gpus)]
+ ["mps", "mps:0"]
+ [f"npu:{i}" for i in range(gpus)]
)
if device and device in device_list:
self._device = torch.device(device)
......@@ -199,6 +204,15 @@ class HFLM(TemplateLM):
config=self.config, backend=backend, trust_remote_code=trust_remote_code
)
# load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
# if we passed `pretrained` as a string, initialize our model now
if isinstance(pretrained, str):
self._create_model(
......@@ -207,6 +221,7 @@ class HFLM(TemplateLM):
dtype=dtype,
trust_remote_code=trust_remote_code,
parallelize=parallelize,
gpus=gpus,
device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
......@@ -235,44 +250,14 @@ class HFLM(TemplateLM):
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
self.truncation = truncation
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
if self.tokenizer.pad_token:
pass
elif self.tokenizer.unk_token:
self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if getattr(self.config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
elif (
self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
# The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
# ---
# Note that the world tokenizer class name, might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert self.tokenizer.pad_token_id == 0
else:
self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
# TODO: override this for Gemma
self.add_bos_token = add_bos_token
if getattr(self.config, "model_type", None) == "gemma":
if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]:
self.add_bos_token = True
eval_logger.info(
f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
......@@ -321,6 +306,7 @@ class HFLM(TemplateLM):
in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
]
), "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
......@@ -329,9 +315,7 @@ class HFLM(TemplateLM):
self._model = accelerator.prepare_model(
self.model, evaluation_mode=True
)
self._device = torch.device(
f"cuda:{accelerator.local_process_index}"
)
self._device = torch.device(f"{accelerator.device}")
self.accelerator = accelerator
if self.accelerator.is_local_main_process:
......@@ -414,6 +398,16 @@ class HFLM(TemplateLM):
def world_size(self):
return self._world_size
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
......@@ -488,6 +482,7 @@ class HFLM(TemplateLM):
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
......@@ -519,6 +514,7 @@ class HFLM(TemplateLM):
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
gpus,
)
)
elif "device_map" not in model_kwargs:
......@@ -527,9 +523,7 @@ class HFLM(TemplateLM):
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
......@@ -579,6 +573,12 @@ class HFLM(TemplateLM):
if model_kwargs.get("load_in_4bit", None):
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
self._model.resize_token_embeddings(len(self.tokenizer))
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision
)
......@@ -667,6 +667,8 @@ class HFLM(TemplateLM):
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else:
max_length = self.max_length
max_context_enc = max_length
max_cont_enc = max_length
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
......@@ -1277,6 +1279,14 @@ class HFLM(TemplateLM):
return res
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
......
......@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = self.add_bos_token
self.add_bos_token = add_bos_token
self._max_length = max_length
......
""" TextSynth API
"""TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
......@@ -11,6 +11,7 @@ Example usage:
Homepage: https://textsynth.com/index.html
"""
import logging
import os
......
......@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
......@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
from transformers.configuration_utils import PretrainedConfig
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
......@@ -613,3 +619,48 @@ class Collator:
if arr:
yield arr
def configure_pad_token(
tokenizer: "PreTrainedTokenizerBase",
model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
"""
This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present.
Some tokenizers require special handling.
Args:
tokenizer: The tokenizer for which the padding token is to be handled.
model_config: The configuration of the model. Default is None.
Returns:
The tokenizer after the padding token has been handled.
Raises:
AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
"""
if tokenizer.pad_token:
pass
elif tokenizer.unk_token:
tokenizer.pad_token_id = tokenizer.unk_token_id
elif tokenizer.eos_token:
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
# handle special cases
if model_config and getattr(model_config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
tokenizer.pad_token = "<|endoftext|>"
elif (
tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
# The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
# ---
# Note that the world tokenizer class name, might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert tokenizer.pad_token_id == 0
else:
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
return tokenizer
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from more_itertools import distribute
from packaging.version import parse as parse_version
......@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
......@@ -21,13 +21,13 @@ from lm_eval.utils import (
try:
import ray
from vllm import LLM, SamplingParams
if parse_version(version("vllm")) > parse_version("0.3.0"):
from vllm.lora.request import LoRARequest
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
pass
if TYPE_CHECKING:
pass
eval_logger = eval_logger
......@@ -102,9 +102,6 @@ class VLLM(TemplateLM):
if self.data_parallel_size <= 1:
self.model = LLM(**self.model_args)
else:
assert parse_version(version("vllm")) < parse_version(
"0.3.3"
), "data_parallel is only compatible with vllm < v0.3.3."
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
......@@ -123,7 +120,14 @@ class VLLM(TemplateLM):
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
)
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
......@@ -175,23 +179,46 @@ class VLLM(TemplateLM):
def max_gen_toks(self):
return self._max_gen_toks
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
truncation=False,
):
""" """
string: Union[str, List[str]],
left_truncate_len: int = None,
add_special_tokens: bool = False,
truncation: bool = False,
) -> Union[List[int], List[List[int]]]:
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding = self.tokenizer.encode(
string, add_special_tokens=add_special_tokens, truncation=truncation
)
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
encoding = encoding[-left_truncate_len:]
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
......@@ -208,7 +235,7 @@ class VLLM(TemplateLM):
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
......@@ -289,7 +316,9 @@ class VLLM(TemplateLM):
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
......@@ -498,7 +527,10 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs:
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
# Tasks
A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
| Task Family | Description | Language(s) |
|-------------|-------------|-------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigourous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greekm English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
import collections
import inspect
import logging
import os
from functools import partial
from typing import Dict, List, Mapping, Optional, Union
from lm_eval import utils
from lm_eval.api.group import ConfigurableGroup, GroupConfig
from lm_eval.api.task import ConfigurableTask, Task
from lm_eval.evaluator_utils import get_subtask_list
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
class TaskManager:
......@@ -14,27 +20,53 @@ class TaskManager:
"""
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
def __init__(
self,
verbosity="INFO",
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(include_path=include_path)
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
)
self._all_tasks = sorted(list(self._task_index.keys()))
self._all_groups = sorted(
[x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
)
self._all_subtasks = sorted(
[x for x in self._all_tasks if self._task_index[x]["type"] == "task"]
)
self._all_tags = sorted(
[x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
)
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(self, include_path: Optional[str] = None):
def initialize_tasks(
self,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
):
"""Creates a dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
:param include_path: Union[str, List] = None
An additional path to be searched for tasks recursively.
Can provide more than one such path as a list.
:param include_defaults: bool = True
If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
:return
Dictionary of task names as key and task metadata
"""
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_defaults:
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
else:
all_paths = []
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
......@@ -51,10 +83,88 @@ class TaskManager:
def all_tasks(self):
return self._all_tasks
@property
def all_groups(self):
return self._all_groups
@property
def all_subtasks(self):
return self._all_subtasks
@property
def all_tags(self):
return self._all_tags
@property
def task_index(self):
return self._task_index
def list_all_tasks(
self, list_groups=True, list_tags=True, list_subtasks=True
) -> str:
from pytablewriter import MarkdownTableWriter
def sanitize_path(path):
# don't print full path if we are within the lm_eval/tasks dir !
# if we aren't though, provide the full path.
if "lm_eval/tasks/" in path:
return "lm_eval/tasks/" + path.split("lm_eval/tasks/")[-1]
else:
return path
group_table = MarkdownTableWriter()
group_table.headers = ["Group", "Config Location"]
gt_values = []
for g in self.all_groups:
path = self.task_index[g]["yaml_path"]
if path == -1:
path = "---"
else:
path = sanitize_path(path)
gt_values.append([g, path])
group_table.value_matrix = gt_values
tag_table = MarkdownTableWriter()
tag_table.headers = ["Tag"]
tag_table.value_matrix = [[t] for t in self.all_tags]
subtask_table = MarkdownTableWriter()
subtask_table.headers = ["Task", "Config Location", "Output Type"]
st_values = []
for t in self.all_subtasks:
path = self.task_index[t]["yaml_path"]
output_type = ""
# read the yaml file to determine the output type
if path != -1:
config = utils.load_yaml_config(path, mode="simple")
if "output_type" in config:
output_type = config["output_type"]
elif (
"include" in config
): # if no output type, check if there is an include with an output type
include_path = path.split("/")[:-1] + config["include"]
include_config = utils.load_yaml_config(include_path, mode="simple")
if "output_type" in include_config:
output_type = include_config["output_type"]
if path == -1:
path = "---"
else:
path = sanitize_path(path)
st_values.append([t, path, output_type])
subtask_table.value_matrix = st_values
result = "\n"
if list_groups:
result += group_table.dumps() + "\n\n"
if list_tags:
result += tag_table.dumps() + "\n\n"
if list_subtasks:
result += subtask_table.dumps() + "\n\n"
return result
def match_tasks(self, task_list):
return utils.pattern_match(task_list, self.all_tasks)
......@@ -64,7 +174,12 @@ class TaskManager:
return False
def _name_is_task(self, name) -> bool:
if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"):
return True
return False
def _name_is_tag(self, name) -> bool:
if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"):
return True
return False
......@@ -125,89 +240,126 @@ class TaskManager:
config["group_alias"] = None
return config
def _class_has_config_in_constructor(self, cls):
constructor = getattr(cls, "__init__", None)
return (
"config" in inspect.signature(constructor).parameters
if constructor
else False
)
def _load_individual_task_or_group(
self,
name_or_config: Optional[Union[str, dict]] = None,
parent_name: Optional[str] = None,
update_config: Optional[dict] = None,
yaml_path: Optional[str] = None,
) -> Mapping:
def load_task(config, task, group=None, yaml_path=None):
def _load_task(config, task):
if "include" in config:
if yaml_path is None:
raise ValueError
config = {
**utils.load_yaml_config(
yaml_path,
yaml_path=None,
yaml_config={"include": config.pop("include")},
mode="full",
),
**config,
}
if self._config_is_python_task(config):
task_object = config["class"]()
if self._class_has_config_in_constructor(config["class"]):
task_object = config["class"](config=config)
else:
task_object = config["class"]()
if isinstance(task_object, ConfigurableTask):
# very scuffed: set task name here. TODO: fixme?
task_object.config.task = config["task"]
else:
config = self._process_alias(config, group=group)
task_object = ConfigurableTask(config=config)
if group is not None:
task_object = (group, task_object)
return {task: task_object}
def _get_group_and_subtask_from_config(config):
group_name = ConfigurableGroup(config=config)
subtask_list = []
for task in group_name.config["task"]:
if isinstance(task, str) and self._name_is_tag(task):
subtask_list.extend(self._get_tasklist(task))
else:
subtask_list.append(task)
return group_name, subtask_list
def _process_group_config(config, update_config=None):
if update_config is not None:
config = {**config, **update_config}
_update_config = {
k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS
}
if not bool(_update_config):
_update_config = None
group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS}
return group_config, _update_config
if isinstance(name_or_config, str):
if update_config is not None:
# Process name_or_config as a dict instead
name_or_config = {"task": name_or_config, **update_config}
elif self._name_is_task(name_or_config):
elif self._name_is_task(name_or_config) or self._name_is_python_task(
name_or_config
):
task_config = self._get_config(name_or_config)
return load_task(task_config, task=name_or_config, group=parent_name)
return _load_task(task_config, task=name_or_config)
else:
group_name = name_or_config
subtask_list = self._get_tasklist(name_or_config)
if subtask_list == -1:
group_config = self._get_config(name_or_config)
subtask_list = group_config["task"]
# This checks if we're at the root.
if parent_name is None:
group_config = self._get_config(name_or_config)
if set(group_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in group_config.items()
if k not in ["task", "group"]
}
yaml_path = self._get_yaml_path(group_name)
if (update_config is not None) and ("group_alias" in update_config):
group_name = update_config["group_alias"]
update_config.pop("group_alias")
group_config, update_config = _process_group_config(group_config)
group_name, subtask_list = _get_group_and_subtask_from_config(
group_config
)
else:
if self._name_is_tag(name_or_config):
fn = partial(
self._load_individual_task_or_group,
update_config=name_or_config
if isinstance(name_or_config, dict)
else None,
)
return dict(
collections.ChainMap(*map(fn, reversed(subtask_list)))
)
else:
group_name = ConfigurableGroup(
config={"group": name_or_config, "task": subtask_list}
)
if isinstance(name_or_config, dict):
if update_config is not None:
name_or_config = {
**name_or_config,
**update_config,
}
if self._config_is_task(name_or_config):
name = name_or_config["task"]
name = name_or_config.pop("task")
if update_config is not None:
name_or_config = {**name_or_config, **update_config}
# If the name is registered as a group
# if self._name_is_task(name) is False:
if self._name_is_group(name):
group_name = name
update_config = {
k: v for k, v in name_or_config.items() if k != "task"
}
group_config = self._get_config(name)
group_config, update_config = _process_group_config(
group_config, name_or_config
)
group_name, subtask_list = _get_group_and_subtask_from_config(
group_config
)
elif self._name_is_tag(name):
subtask_list = self._get_tasklist(name)
if subtask_list == -1:
subtask_list = self._get_config(name)["task"]
fn = partial(
self._load_individual_task_or_group,
update_config=name_or_config,
)
return dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
else:
if self._name_is_registered(name):
base_task_config = self._get_config(name)
# Check if this is a duplicate.
if parent_name is not None:
name_or_config["group"] = parent_name
num_duplicate = len(
list(
filter(
......@@ -226,34 +378,21 @@ class TaskManager:
}
else:
task_config = name_or_config
return load_task(
task_config, task=name, group=parent_name, yaml_path=yaml_path
)
return _load_task(task_config, task=name)
else:
group_name = name_or_config["group"]
subtask_list = name_or_config["task"]
if set(name_or_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in name_or_config.items()
if k not in ["task", "group"]
}
all_subtasks = {}
if parent_name is not None:
all_subtasks = {group_name: (parent_name, None)}
group_config, update_config = _process_group_config(name_or_config)
group_name, subtask_list = _get_group_and_subtask_from_config(
group_config
)
fn = partial(
self._load_individual_task_or_group,
parent_name=group_name,
update_config=update_config,
yaml_path=yaml_path,
)
all_subtasks = {
**all_subtasks,
**dict(collections.ChainMap(*map(fn, subtask_list))),
return {
group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
}
return all_subtasks
def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
"""Loads a dictionary of task objects from a list
......@@ -277,10 +416,11 @@ class TaskManager:
def _get_task_and_group(self, task_dir: str):
"""Creates a dictionary of tasks index with the following metadata,
- `type`, that can be either `task`, `python_task`, or `group`.
- `type`, that can be either `task`, `python_task`, `group` or `tags`.
`task` refer to regular task configs, `python_task` are special
yaml files that only consists of `task` and `class` parameters.
`group` are group configs.
`group` are group configs. `tags` are labels that can be assigned
to tasks to assist in sorting and calling tasks of certain themes.
- `yaml_path`, path to the yaml file. If the entry is a `group` that
was configured through a task config, the yaml_path will be -1
and all subtasks will be listed in `task` (see below)
......@@ -296,8 +436,15 @@ class TaskManager:
:return
Dictionary of task names as key and task metadata
"""
# TODO: remove group in next release
print_info = True
ignore_dirs = [
"__pycache__",
".ipynb_checkpoints",
]
tasks_and_groups = collections.defaultdict()
for root, _, file_list in os.walk(task_dir):
for root, dirs, file_list in os.walk(task_dir):
dirs[:] = [d for d in dirs if d not in ignore_dirs]
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
......@@ -337,20 +484,38 @@ class TaskManager:
"yaml_path": yaml_path,
}
if "group" in config:
groups = config["group"]
if isinstance(config["group"], str):
groups = [groups]
for group in groups:
if group not in tasks_and_groups:
tasks_and_groups[group] = {
"type": "group",
"task": [task],
"yaml_path": -1,
}
else:
tasks_and_groups[group]["task"].append(task)
# TODO: remove group in next release
for attr in ["tag", "group"]:
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
"`tag` will be used to allow to call a collection of tasks just like `group`. "
"`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
"which will be the offical way to create groups with addition of group-wide configuations."
)
print_info = False
# attr = "tag"
attr_list = config[attr]
if isinstance(attr_list, str):
attr_list = [attr_list]
for tag in attr_list:
if tag not in tasks_and_groups:
tasks_and_groups[tag] = {
"type": "tag",
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag {tag} is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
break
else:
tasks_and_groups[tag]["task"].append(task)
else:
self.logger.debug(f"File {f} in {root} could not be loaded")
......@@ -379,6 +544,33 @@ def get_task_name_from_object(task_object):
)
def _check_duplicates(task_dict: dict) -> List[str]:
"""helper function solely used in validating get_task_dict output.
Takes the output of lm_eval.evaluator_utils.get_subtask_list and
returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
"oversubscribed" to several disjoint groups.
"""
subtask_names = []
for key, value in task_dict.items():
subtask_names.extend(value)
duplicate_tasks = {
task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
}
# locate the potentially problematic groups that seem to 'compete' for constituent subtasks
competing_groups = [
group
for group in task_dict.keys()
if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
]
if len(duplicate_tasks) > 0:
raise ValueError(
f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
)
def get_task_dict(
task_name_list: Union[str, List[Union[str, Dict, Task]]],
task_manager: Optional[TaskManager] = None,
......@@ -396,6 +588,7 @@ def get_task_dict(
:return
Dictionary of task objects
"""
task_name_from_string_dict = {}
task_name_from_config_dict = {}
task_name_from_object_dict = {}
......@@ -413,7 +606,9 @@ def get_task_dict(
)
string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
others_task_name_list = [
task for task in task_name_list if not isinstance(task, str)
]
if len(string_task_name_list) > 0:
if task_manager is None:
task_manager = TaskManager()
......@@ -440,8 +635,16 @@ def get_task_dict(
):
raise ValueError
return {
final_task_dict = {
**task_name_from_string_dict,
**task_name_from_config_dict,
**task_name_from_object_dict,
}
# behavior can get odd if one tries to invoke several groups that "compete" for the same task.
# (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
# and we'd be unsure which to use and report.)
# we explicitly check and error in this case.
_check_duplicates(get_subtask_list(final_task_dict))
return final_task_dict
......@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
}
```
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
......
group: aclue
task:
- aclue_ancient_chinese_culture
- aclue_ancient_literature
- aclue_ancient_medical
- aclue_ancient_phonetics
- aclue_basic_ancient_chinese
- aclue_couplet_prediction
- aclue_homographic_character_resolution
- aclue_named_entity_recognition
- aclue_poetry_appreciate
- aclue_poetry_context_prediction
- aclue_poetry_quality_assessment
- aclue_poetry_sentiment_analysis
- aclue_polysemy_resolution
- aclue_reading_comprehension
- aclue_sentence_segmentation
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
......@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
### Citation
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
- `EXAMS Arabic`: include IslamicStudies, Biology, Science, Physics, Social.
- `aexams`: Arabic EXAMS dataset, including IslamicStudies, Biology, Science, Physics, Social subjects.
#### Tasks
......
group: aexams
task:
- aexams_Biology
- aexams_IslamicStudies
- aexams_Physics
- aexams_Science
- aexams_Social
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment