Unverified Commit f4f59251 authored by KonradSzafer, committed by GitHub

Add dataset card when pushing to HF hub (#1898)



* dataset card initial

* few fixes

* adds groups for math, mmlu, gpqa

* added summary args

* moved sanitize_list to utils

* readme update

* recreate metadata moved

* multiple model support

* results latest split fix

* readme update and small refactor

* fix grouping

* add comments

* added pathlib

* corrected pathlib approach

* check whether to create a metadata card

* convert posix paths to str

* default hf org from token

* hf token value error

* Add logs after successful upload

* logging updates

* dataset card example in the readme

---------
Co-authored-by: Nathan Habib <nathan.habib@huggingface.com>
Co-authored-by: Alina Lozovskaia <alinailozovskaya@gmail.com>
parent 14221c84
......@@ -307,7 +307,7 @@ To save evaluation results provide an `--output_path`. We also support logging m
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example output](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo/tree/main/microsoft__phi-2). For instance:
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:
```bash
lm_eval --model hf \
......@@ -318,6 +318,13 @@ lm_eval --model hf \
--hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \
```
This allows you to easily download the results and samples from the Hub, using:
```python
from datasets import load_dataset
load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest")
```
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
## Visualizing Results
......
......@@ -51,11 +51,13 @@ This mode supports a number of command-line arguments, the details of which can
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
* `--hf_hub_log_args`: Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas (a combined example follows the list). Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`,
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
* `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
* `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
* `push_samples_to_hub` - whether to push sample results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
* `public_repo` - whether the repository is public, can be `True` or `False`,
* `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
* `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.
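For instance, the arguments above can be combined as follows (the values shown are the illustrative ones used in this documentation, not defaults): ```--hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False,point_of_contact=yourname@example.com```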
## External Library Usage
......
......@@ -277,13 +277,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if (
"push_results_to_hub" in evaluation_tracker_args
or "push_samples_to_hub" in evaluation_tracker_args
) and "hub_results_org" not in evaluation_tracker_args:
raise ValueError(
"If push_results_to_hub or push_samples_to_hub is set, results_org must be specified."
)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning(
"Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
......@@ -402,6 +395,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
task_name=task_name, samples=samples[task_name]
)
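# Recreate the dataset card only when results or samples were actually pushed to the Hub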
if (
evaluation_tracker.push_results_to_hub
or evaluation_tracker.push_samples_to_hub
):
evaluation_tracker.recreate_metadata_card()
print(
f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
......
import json
import re
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from datasets import load_dataset
from datasets.utils.metadata import MetadataConfigs
from huggingface_hub import (
DatasetCard,
DatasetCardData,
HfApi,
hf_hub_url,
)
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
sanitize_list,
)
......@@ -88,31 +97,53 @@ class EvaluationTracker:
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
leaderboard_url: str = "",
point_of_contact: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.leaderboard_url = leaderboard_url
self.point_of_contact = point_of_contact
self.api = HfApi(token=token) if token else None
if not self.api and (push_results_to_hub or push_samples_to_hub):
raise ValueError(
"Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
"Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
)
if (
self.api
and hub_results_org == ""
and (push_results_to_hub or push_samples_to_hub)
):
hub_results_org = self.api.whoami()["name"]
eval_logger.warning(
f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
)
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
def save_results_aggregated(
self,
results: dict,
......@@ -160,23 +191,28 @@ class EvaluationTracker:
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
repo_id = (
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
else self.hub_results_repo_private
)
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_id=repo_id,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
eval_logger.info(
"Successfully pushed aggregated results to the Hugging Face Hub. "
f"You can find them at: {repo_id}"
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
......@@ -200,13 +236,7 @@ class EvaluationTracker:
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
eval_logger.info(f"Saving per-sample results for: {task_name}")
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
......@@ -215,29 +245,268 @@ class EvaluationTracker:
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
for sample in samples:
# we first need to sanitize arguments and resps
# otherwise we won't be able to load the dataset
# using the datasets library
arguments = {}
for i, arg in enumerate(sample["arguments"]):
arguments[f"gen_args_{i}"] = {}
for j, tmp in enumerate(arg):
arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
sample_dump = (
json.dumps(
sample,
default=handle_non_serializable,
ensure_ascii=False,
)
+ "\n"
)
with open(file_results_samples, "a") as f:
f.write(sample_dump)
if self.api and self.push_samples_to_hub:
self.api.create_repo(
repo_id = (
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
else self.hub_results_repo_private
)
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_id=repo_id,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
eval_logger.info(
f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
f"You can find them at: {repo_id}"
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
def recreate_metadata_card(self) -> None:
"""
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""
def get_file_task_name(filename: str) -> str:
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_task_name(task_name: str) -> str:
return re.sub(r"\W", "_", task_name)
eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
# - Each task and sample results, if existing
# i.e. {
# "org__model_name__gsm8k": "2021-09-01T12:00:00",
# "org__model_name__ifeval": "2021-09-01T12:00:00",
# "org__model_name__results": "2021-09-01T12:00:00"
# }
latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
for file_path in sample_files:
file_path = Path(file_path)
filename = file_path.name
model_name = file_path.parent
task_name = get_file_task_name(filename)
results_datetime = get_file_datetime(filename)
task_name_sanitized = sanitize_task_name(task_name)
# Results and sample results for the same model and task will have the same datetime
samples_key = f"{model_name}__{task_name_sanitized}"
results_key = f"{model_name}__results"
latest_datetime = max(
latest_task_results_datetime[samples_key],
results_datetime,
)
latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = latest_datetime
# Create metadata card
card_metadata = MetadataConfigs()
# Add the latest aggregated results to the metadata card for easy access
for file_path in results_files:
file_path = Path(file_path)
results_filename = file_path.name
model_name = file_path.parent
eval_date = get_file_datetime(results_filename)
eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
results_filename = Path("**") / Path(results_filename).name
config_name = f"{model_name}__results"
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Add the tasks details configs
for file_path in sample_files:
file_path = Path(file_path)
filename = file_path.name
model_name = file_path.parent
task_name = get_file_task_name(filename)
eval_date = get_file_datetime(filename)
task_name_sanitized = sanitize_task_name(task_name)
eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
results_filename = Path("**") / Path(filename).name
config_name = f"{model_name}__{task_name_sanitized}"
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Special case for grouped tasks (mmlu, gpqa, minerva_math):
# we add an extra config per group that gathers all of its sub-task results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max(
latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
)
last_results_file = [
f for f in results_files if latest_datetime.replace(":", "-") in f
][0]
last_results_file_path = hf_hub_url(
repo_id=repo_id, filename=last_results_file, repo_type="dataset"
)
latest_results_file = load_dataset(
"json", data_files=last_results_file_path, split="train"
)
results_dict = latest_results_file["results"][0]
new_dictionary = {"all": results_dict}
new_dictionary.update(results_dict)
results_string = json.dumps(new_dictionary, indent=4)
dataset_summary = (
"Dataset automatically created during the evaluation run of model "
)
if self.general_config_tracker.model_source == "hf":
dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
else:
dataset_summary += f"{self.general_config_tracker.model_name}\n"
dataset_summary += (
f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
'configuration, the split being named using the timestamp of the run. The "latest" split always points to the latest results.\n\n'
'An additional configuration "results" stores all the aggregated results of the run.\n\n'
"To load the details from a run, you can for instance do the following:\n"
)
if self.general_config_tracker.model_source == "hf":
dataset_summary += (
"```python\nfrom datasets import load_dataset\n"
f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
)
dataset_summary += (
"## Latest results\n\n"
f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
"(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```"
)
card_data = DatasetCardData(
dataset_summary=dataset_summary,
repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
leaderboard_url=self.leaderboard_url,
point_of_contact=self.point_of_contact,
)
card_metadata.to_dataset_card_data(card_data)
card = DatasetCard.from_template(
card_data,
pretty_name=card_data.pretty_name,
)
card.push_to_hub(repo_id, repo_type="dataset")
......@@ -81,6 +81,18 @@ def handle_non_serializable(o):
return str(o)
def sanitize_list(sub):
"""
Takes a possibly nested list and recursively converts all inner components to strings
"""
if isinstance(sub, list):
return [sanitize_list(item) for item in sub]
if isinstance(sub, tuple):
return tuple(sanitize_list(item) for item in sub)
else:
return str(sub)
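# Illustrative behavior of sanitize_list (assumed example, not part of the original change):
#   sanitize_list([[1, 2], (3.5, None)])  ->  [['1', '2'], ('3.5', 'None')]
# Nesting of lists and tuples is preserved; only the leaf values are stringified.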
def simple_parse_args_string(args_string):
"""
Parses something like
......