Commit a6b358ca authored by Rayyyyy

update version

parent ed53d51c
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Address all TODOs and remove all explanatory comments
"""QuAC dataset."""
import json
import datasets
_CITATION = """\
@article{choi2018quac,
title={Quac: Question answering in context},
author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:1808.07036},
year={2018}
}
"""
_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""
_HOMEPAGE = "https://quac.ai/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
"validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}
class Quac(datasets.GeneratorBasedBuilder):
"""Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog."""
VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="quac", version=VERSION, description="The QuAC dataset"
),
]
def _info(self):
features = datasets.Features(
{
"title": datasets.Value("string"),
"section_title": datasets.Value("string"),
"paragraph": datasets.Value("string"),
"question": datasets.Value("string"),
"answer": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["train"],
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
data = json.load(f)["data"]
key = 0
for row in data:
paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
qas = row["paragraphs"][0]["qas"]
qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
for (question, answer) in qa_pairs:
# Yields examples as (key, example) tuples
yield key, {
"title": row["title"],
"section_title": row["section_title"],
"paragraph": paragraph,
"question": question,
"answer": answer,
}
key += 1
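# Usage sketch: one way to load the builder above with the `datasets` library.
# The local script name "quac.py" is an assumption, and recent `datasets`
# releases may additionally require `trust_remote_code=True` for script-based
# datasets.
if __name__ == "__main__":
    quac = datasets.load_dataset("./quac.py", name="quac")
    sample = quac["validation"][0]
    # Each example is one (question, answer) pair plus its source paragraph.
    print(sample["title"], "|", sample["section_title"])
    print("Q:", sample["question"])
    print("A:", sample["answer"])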
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SAT Analogy Questions dataset."""
import os
import datasets
_CITATION = """\
@article{article,
author = {Turney, Peter},
year = {2006},
month = {09},
pages = {379-416},
title = {Similarity of Semantic Relations},
volume = {32},
journal = {Computational Linguistics},
doi = {10.1162/coli.2006.32.3.379}
}
"""
_DESCRIPTION = """\
SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.
"""
_HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
class SatAnalogies(datasets.GeneratorBasedBuilder):
"""SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="sat_analogies",
version=VERSION,
description="The SAT Analogy Questions dataset",
),
]
@property
def manual_download_instructions(self):
return (
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
def _info(self):
features = datasets.Features(
{
"source": datasets.Value("string"),
"stem": datasets.Value("string"),
"choices": datasets.features.Sequence(datasets.Value("string")),
"solution": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
if not os.path.exists(data_dir):
raise FileNotFoundError(
f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. Manual download instructions: {self.manual_download_instructions}"
)
return [
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(data_dir, "SAT-package-V3.txt"),
},
)
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath):
data = []
with open(filepath, "r", encoding="utf-8") as f:
record = []
for line in f:
line = line.strip()
if len(line) == 0 and record:
data.append(record)
record = []
elif len(line) > 0 and line[0] == "#":
# Skip comments.
continue
else:
record.append(line)
data.append(record)
for key, record in enumerate(data):
source = record[-8]
stem = record[-7]
choices = record[-6:-1]
solution = record[-1]
yield key, {
"source": source,
"stem": stem,
"choices": choices,
"solution": solution,
}
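# Usage sketch: SAT Analogy Questions must be obtained manually, as described in
# `manual_download_instructions` above. The local script name "sat_analogies.py"
# and the data_dir path are placeholder assumptions.
if __name__ == "__main__":
    sat = datasets.load_dataset("./sat_analogies.py", data_dir="path/to/folder")
    record = sat["validation"][0]
    print(record["stem"])      # analogy stem line from SAT-package-V3.txt
    print(record["choices"])   # the five candidate lines
    print(record["solution"])  # the line marking the correct choice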
{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unscramble dataset."""
import json
import os
import datasets
_CITATION = """\
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
"""
_DESCRIPTION = """\
Unscramble is a small battery of 5 “character manipulation” tasks. Each task
involves giving the model a word distorted by some combination of scrambling,
addition, or deletion of characters, and asking it to recover the original word.
"""
_HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
_DESCRIPTIONS = {
"mid_word_1_anagrams": "Anagrams of all but the first and last letter.",
"mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.",
"cycle_letters_in_word": "Cycle letters in the word.",
"random_insertion_in_word": "Random insertions in the word that must be removed.",
"reversed_words": "Words spelled backwards that must be reversed.",
}
_NAMES = _DESCRIPTIONS.keys()
class Unscramble(datasets.GeneratorBasedBuilder):
"""Unscramble is a small battery of 5 “character manipulation” tasks."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=name, version=version, description=_DESCRIPTIONS[name]
)
for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
]
def _info(self):
features = datasets.Features(
{
"context": datasets.Value("string"),
"completion": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz")
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir,
"split": "validation",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f):
data = json.loads(row)
yield key, {
"context": data["context"],
"completion": data["completion"],
}
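# Usage sketch: each key of `_DESCRIPTIONS` is a separate config of this builder.
# The local script name "unscramble.py" is an assumption.
if __name__ == "__main__":
    reversed_words = datasets.load_dataset("./unscramble.py", name="reversed_words")
    row = reversed_words["validation"][0]
    print(row["context"])     # prompt containing the distorted word
    print(row["completion"])  # the original word to recover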
import os
import zstandard
import json
import jsonlines
import io
import datetime
import io
import json
import mmap
import tqdm
import os
from pathlib import Path
from typing import Any
import jsonlines
import tqdm
import zstandard
def json_serial(obj):
def json_serial(obj: Any) -> str:
"""JSON serializer for objects not serializable by default json code"""
if isinstance(obj, (datetime.datetime,)):
@@ -19,7 +21,7 @@ def json_serial(obj):
# Modified version of lm_dataformat Archive for single file.
class Archive:
def __init__(self, file_path, compression_level=3):
def __init__(self, file_path: str, compression_level: int = 3) -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
@@ -28,7 +30,9 @@ class Archive:
self.cctx = zstandard.ZstdCompressor(level=compression_level)
self.compressor = self.cctx.stream_writer(self.fh)
def add_data(self, data, meta={}):
def add_data(self, data, meta=None) -> None:
if meta is None:
meta = {}
self.compressor.write(
json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
"UTF-8"
@@ -36,7 +40,7 @@ class Archive:
+ b"\n"
)
def commit(self):
def commit(self) -> None:
self.compressor.flush(zstandard.FLUSH_FRAME)
self.fh.flush()
self.fh.close()
@@ -44,10 +48,16 @@ class Archive:
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
def __init__(self):
def __init__(self) -> None:
pass
def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
def read(
self,
file,
get_meta: bool = False,
autojoin_paragraphs: bool = True,
para_joiner: str = "\n\n",
):
with open(file, "rb") as fh:
self.fh = fh
cctx = zstandard.ZstdDecompressor()
@@ -72,7 +82,7 @@ class Reader:
class TextArchive:
def __init__(self, file_path, mode="rb+"):
def __init__(self, file_path, mode: str = "rb+") -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
@@ -83,24 +93,24 @@ class TextArchive:
self.fh = open(self.file_path, mode)
def add_data(self, data):
def add_data(self, data) -> None:
self.fh.write(data.encode("UTF-8") + b"\n")
def commit(self):
def commit(self) -> None:
self.fh.flush()
self.fh.close()
class TextReader:
def __init__(self, file_path):
def __init__(self, file_path) -> None:
self.file_path = file_path
# Optimized mmap read with infrequent tqdm updates to maintain speed
# Tested up to 250MB/s.
def read_tqdm(self, update_frequency=10000):
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r") as fh, tqdm.tqdm(
with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
@@ -149,7 +159,7 @@ class TextReader:
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
def __init__(self, file):
def __init__(self, file) -> None:
self.file = file
def read_tqdm(self):
......
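# Usage sketch for the classes above (rough illustration; the file name
# "example.jsonl.zst" and the document text are placeholders):
#
#   archive = Archive("example.jsonl.zst")
#   archive.add_data("some document text", meta={"source": "demo"})
#   archive.commit()
#
#   reader = Reader()
#   for doc in reader.read("example.jsonl.zst"):
#       print(doc)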
import time
import random
import pickle
import json
import collections
import glob
import json
import os
import collections
import pickle
import random
import time
from .janitor import Janitor, word_ngrams
from .archiver import ZStdTextReader
from .janitor import Janitor, word_ngrams
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
simulated_overlap = 0.1
contaminated = int(len(docs) * simulated_overlap)
return random.sample(range(len(docs)), contaminated)
@@ -25,6 +25,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
# files. These should exist in the "ngrams_path" provided to this function.
# Algorithm:
# 1. Build lookups for each dataset {ngram: list(document_ids)}
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
@@ -33,11 +34,11 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# 4. Strip the task_set from the dictionary keys and return
#
# We cache the task+set lookups as well as the overlaps.
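# Rough shape of the intermediate structures (inferred from the comments above
# and the code below, shown here only as an illustration):
#   lookups[(task_name, task_set)] = {ngram: [doc_id, ...]}
#   merged_lookup[ngram] = [(task_name, task_set, [doc_id, ...]), ...]
#   duplicates[(task_name, task_set)] = {doc_id, ...}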
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
# return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
info_dict_path = os.path.join(ngrams_path, "info.json")
info_dict = json.load(open(info_dict_path, "r"))
info_dict = json.load(open(info_dict_path, "r", encoding="utf-8"))
ngrams_n_size = info_dict["ngram_size"]
janitor = Janitor()
@@ -46,7 +47,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print("Building Lookups...")
start = time.perf_counter()
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
lookups = {}
@@ -108,7 +109,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print(f"Merging lookups took {elapsed:0.5f} seconds.")
print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst"))
files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
print(files)
for file in files:
@@ -134,11 +135,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
matching_unique += 1
for task_name, task_set, doc_ids in merged_lookup[ngram]:
task_doc_set = duplicates[(task_name, task_set)]
for (
doc_id
) in (
doc_ids
): # Record contamination across all relevant task/set combos
for doc_id in doc_ids: # Record contamination across all relevant task/set combos
task_doc_set.add(doc_id)
del merged_lookup[ngram] # No point matching again
else:
......
import pickle
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint
from typing import Iterator, List, Sequence, Tuple, TypeVar
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
@@ -16,10 +16,12 @@ except Exception:
traceback.print_exc()
JANITOR_CPP = False
T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
@@ -36,7 +38,7 @@ def form_ngrams(sequence, n):
del history[0]
def word_ngrams(s, n):
def word_ngrams(s: str, n: int) -> Iterator[str]:
"""Splits a string into ngram words"""
tokens = s.split() # not a generator :(
ngram_seqs = form_ngrams(iter(tokens), n)
@@ -68,14 +70,14 @@ def word_ngrams(s, n):
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
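# Illustrative example: list(split_indices("to be or"))
#   -> [("to", (0, 1)), ("be", (3, 4)), ("or", (6, 7))]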
def word_ngrams_indices(s, n):
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
@@ -104,16 +106,15 @@ def word_ngrams_indices(s, n):
class Janitor:
# FIXME delete_chars: Should anything else go here? Special chars?
def __init__(
self,
ngram_n=13,
window_to_remove=200,
too_dirty_cutoff=10,
minimum_slice_length=200,
delete_chars=string.punctuation,
):
ngram_n: int = 13,
window_to_remove: int = 200,
too_dirty_cutoff: int = 10,
minimum_slice_length: int = 200,
delete_chars: str = string.punctuation,
) -> None:
self.ngram_n = ngram_n
self.window_to_remove = window_to_remove
self.too_dirty_cutoff = too_dirty_cutoff
@@ -135,11 +136,11 @@ class Janitor:
# I/O for saving contamination ngrams
##############
def save_contamination_ngrams(self, filename):
def save_contamination_ngrams(self, filename: str) -> None:
with open(filename, "wb") as fp:
pickle.dump(filename, fp)
def load_contamination_ngrams(self, filename):
def load_contamination_ngrams(self, filename: str) -> None:
with open(filename, "rb") as fp:
self.dirt_ngrams = pickle.load(fp)
@@ -147,7 +148,7 @@ class Janitor:
# Call these :)
##############
def register_contaminant(self, dirt_string):
def register_contaminant(self, dirt_string: str) -> None:
"""Register a string as contamination to be removed, e.g. a test set
This breaks the dirt_string into ngrams to store for future cleaning"""
if JANITOR_CPP:
@@ -156,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string):
def clean(self, dirty_string: str) -> List[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
@@ -166,7 +167,9 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.clean_python(dirty_string)
def _split_chunks(self, dirty_string, dirty_parts):
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[Tuple]
) -> List[str]:
clean_chunks = []
splice_idx = 0
end = -1
@@ -189,12 +192,12 @@ class Janitor:
# Fast C++
##############
def register_contaminant_cpp(self, dirt_string):
def register_contaminant_cpp(self, dirt_string) -> None:
self.dirt_ngrams.update(
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string):
def clean_cpp(self, dirty_string: str) -> List[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
@@ -204,15 +207,15 @@ class Janitor:
# Slow python
##############
def normalize_string(self, s):
def normalize_string(self, s: str) -> str:
return s.translate(self.translation_table)
def register_contaminant_python(self, dirt_string):
def register_contaminant_python(self, dirt_string: str) -> None:
self.dirt_ngrams.update(
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string):
def clean_python(self, dirty_string: str) -> List[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
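# Usage sketch for the Janitor class above (illustrative strings only):
#
#   janitor = Janitor(ngram_n=13)
#   janitor.register_contaminant(test_set_text)   # e.g. concatenated test documents
#   clean_chunks = janitor.clean(training_text)   # clean slices, [] if too dirty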
import collections
import itertools
import numpy as np
import json
import logging
import random
import lm_eval.metrics
import time
from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union
import numpy as np
import torch
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
from lm_eval.utils import positional_deprecated, run_task_tests
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
consolidate_results,
get_sample_size,
get_task_list,
prepare_print_tasks,
print_writeout,
run_task_tests,
)
from lm_eval.logging.utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
simple_parse_args_string,
)
if TYPE_CHECKING:
from lm_eval.api.model import LM
from lm_eval.tasks import Task
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
tasks=[],
num_fewshot=0,
batch_size=None,
max_batch_size=None,
device=None,
no_cache=False,
limit=None,
bootstrap_iters=100000,
description_dict=None,
check_integrity=False,
decontamination_ngrams_path=None,
write_out=False,
output_base_path=None,
log_samples=True,
gen_kwargs=None,
model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
delete_requests_cache: bool = False,
limit: Optional[Union[int, float]] = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
write_out: bool = False,
log_samples: bool = True,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO",
predict_only: bool = False,
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
Name of model or LM object, see lm_eval.models.get_model
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
:param model_args: Optional[str, dict]
String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
:param tasks: list[Union[str, dict, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
@@ -46,49 +81,174 @@ def simple_evaluate(
Maximal batch size to try with automatic batch size detection
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param use_cache: str, optional
A path to a sqlite db file for caching model responses. `None` if not caching.
:param cache_requests: bool, optional
Speed up evaluation by caching the building of dataset requests. `None` if not caching.
:param rewrite_requests_cache: bool, optional
Rewrites all of the request cache if set to `True`. `None` if not desired.
:param delete_requests_cache: bool, optional
Deletes all of the request cache if set to `True`. `None` if not desired.
:param limit: int or float, optional
Limit the number of examples per task (only use this for testing). If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir.
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param gen_kwargs: str
String arguments for model generation
Ignored for all tasks with loglikelihood output_type
:param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int
Random seed for python's random module. If set to None, the seed will not be set.
:param numpy_random_seed: int
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:param fewshot_random_seed: int
Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
start_date = time.time()
if delete_requests_cache:
eval_logger.info("Deleting requests cache...")
delete_cache()
seed_message = []
if random_seed is not None:
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
seed_message.append(f"Setting random seed to {random_seed}")
random.seed(random_seed)
if numpy_random_seed is not None:
seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
np.random.seed(numpy_random_seed)
if torch_random_seed is not None:
seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if seed_message:
eval_logger.info(" | ".join(seed_message))
if tasks is None:
tasks = []
if len(tasks) == 0:
raise ValueError(
"No tasks specified, or no tasks found. Please verify the task names."
)
assert tasks != [], "No tasks specified"
if gen_kwargs is not None:
gen_kwargs = simple_parse_args_string(gen_kwargs)
eval_logger.warning(
"generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. "
"Ensure 'do_sample=True' for non-greedy decoding!"
)
if gen_kwargs == "":
gen_kwargs = None
if isinstance(model, str):
if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
if isinstance(model_args, dict):
eval_logger.info(
f"Initializing {model} model, with arguments: {model_args}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
else:
assert isinstance(model, lm_eval.base.LM)
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
else:
if not isinstance(model, lm_eval.api.model.LM):
raise TypeError
eval_logger.info("Using pre-initialized model")
lm = model
if not no_cache:
lm = lm_eval.base.CachingLM(
if use_cache is not None:
eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
lm = lm_eval.api.model.CachingLM(
lm,
"lm_cache/"
+ (model if isinstance(model, str) else model.model.config._name_or_path)
+ "_"
+ model_args.replace("=", "-").replace(",", "_").replace("/", "-")
use_cache
# each rank receives a different cache db.
# necessary to avoid multiple writes to cache at once
+ "_rank"
+ str(lm.rank)
+ ".db",
)
task_dict = lm_eval.tasks.get_task_dict(tasks)
if task_manager is None:
task_manager = TaskManager(verbosity)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if isinstance(task_obj, tuple):
_, task_obj = task_obj
if task_obj is None:
continue
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
log_samples = True
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
task_obj.set_config(key="num_fewshot", value=0)
if check_integrity:
run_task_tests(task_list=tasks)
@@ -96,316 +256,366 @@ def simple_evaluate(
results = evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=num_fewshot,
limit=limit,
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters,
description_dict=description_dict,
decontamination_ngrams_path=decontamination_ngrams_path,
write_out=write_out,
output_base_path=output_base_path,
log_samples=log_samples,
verbosity=verbosity,
)
if lm.rank == 0:
if isinstance(model, str):
model_name = model
elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
model_name = model.config._name_or_path
else:
model_name = type(model).__name__
# add info about the model and few shot config
results["config"] = {
"model": (model if isinstance(model, str) else model.model.config._name_or_path),
"model": model_name,
"model_args": model_args,
"num_fewshot": num_fewshot,
}
# add more detailed model info if available
if isinstance(lm, lm_eval.models.huggingface.HFLM):
results["config"].update(lm.get_model_info())
# add info about execution
results["config"].update(
{
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"no_cache": no_cache,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"description_dict": description_dict,
"gen_kwargs": gen_kwargs,
"random_seed": random_seed,
"numpy_seed": numpy_random_seed,
"torch_seed": torch_random_seed,
"fewshot_seed": fewshot_random_seed,
}
)
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
return results
decontaminate_suffix = "_decontaminate"
else:
return None
@positional_deprecated
def evaluate(
lm,
lm: "LM",
task_dict,
provide_description=None,
num_fewshot=0,
limit=None,
bootstrap_iters=100000,
description_dict=None,
decontamination_ngrams_path=None,
write_out=False,
output_base_path=None,
limit: Optional[int] = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
bootstrap_iters: Optional[int] = 100000,
write_out: bool = False,
log_samples: bool = True,
verbosity: str = "INFO",
):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param write_out: bool
If True, write all prompts, logits and metrics to json for offline analysis
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
decontaminate = decontamination_ngrams_path is not None
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if (task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
overlaps = collections.defaultdict(list) # {task_name: contaminated_docs}
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
write_out_info = {}
docs_for_decontamination = collections.defaultdict(list)
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
task_set = "test" # Required for caching in the decontamination
elif task.has_validation_docs():
task_set = "val" # Required for caching in the decontamination
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
rnd = random.Random()
rnd.seed(42)
rnd.shuffle(task_docs)
print(f"Task: {task_name}; number of docs: {len(task_docs)}")
if write_out:
prompt_details = []
description = (
description_dict[task_name]
if description_dict and task_name in description_dict
else ""
)
if limit is not None:
limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
if decontaminate and task.should_decontaminate():
docs_for_decontamination[(task_name, task_set)].append(
task.doc_to_decontamination_query(doc)
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
# tracks all Instances/requests a model must generate output on.
requests = defaultdict(list)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = defaultdict(int)
# get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
task.build_all_requests(
limit=limit,
rank=lm.rank,
world_size=lm.world_size,
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
)
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
)
reqs = task.construct_requests(doc, ctx)
if write_out:
prompt_details.append({"doc_id": doc_id})
# print the prompt for the first few documents
if doc_id < 1:
print(
f"Task: {task_name}; document {doc_id}; context prompt (starting on next line):\n{ctx}\n(end of prompt on previous line)"
print_writeout(task)
# aggregate Instances by LM method requested to get output.
for instance in task.instances:
reqtype = instance.request_type
requests[reqtype].append(instance)
if lm.world_size > 1:
instances_rnk = torch.tensor(len(task._instances), device=lm.device)
gathered_item = (
lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
)
print("Requests:", reqs)
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
if write_out:
prompt_details[-1][f"prompt_{i}"] = "".join(
(map(lambda x: "".join(x), req.args))
# "multiple_choice" task types dispatch (several) "loglikelihood" request types
reqtype = (
"loglikelihood"
if task.OUTPUT_TYPE == "multiple_choice"
else task.OUTPUT_TYPE
)
# compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
numpad = max(gathered_item) - gathered_item[lm.rank]
# todo: may not account for padding in cases like SquadV2 which has multiple req types
padding_requests[reqtype] += numpad
if write_out:
write_out_info[task_name] = prompt_details
# Compare all tasks/sets at once to ensure a single training set scan
if decontaminate:
from lm_eval.decontamination.decontaminate import get_train_overlap
print("Finding train/test overlap, please wait...")
overlaps = get_train_overlap(
docs_for_decontamination, decontamination_ngrams_path, limit
)
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
### Run LM on inputs, get all outputs ###
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [
x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
process_res_queue[(task_name, doc_id)].append((i, resp))
if write_out:
write_out_info[task_name][doc_id][f"logit_{i}"] = resp
task = task_dict[task_name]
if isinstance(task, lm_eval.base.MultipleChoiceTask):
write_out_info[task_name][doc_id]["truth"] = doc["gold"]
elif isinstance(task, lm_eval.tasks.winogrande.Winogrande):
write_out_info[task_name][doc_id]["truth"] = task.answer_to_num[
doc["answer"]
]
else:
write_out_info[task_name][doc_id]["truth"] = task.doc_to_target(doc)
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for (task_name, doc_id), requests in process_res_queue.items():
    requests.sort(key=lambda x: x[0])
    requests = [x[1] for x in requests]

    task = task_dict[task_name]
    doc = docs[(task_name, doc_id)]

    metrics = task.process_results(doc, requests)
    for metric, value in metrics.items():
        vals[(task_name, metric)].append(value)

        if write_out:
            write_out_info[task_name][doc_id][metric] = str(value)

        # Re-use the evaluation for the decontaminated set by just ignoring the overlaps
        if decontaminate and task_name in overlaps:
            if doc_id not in overlaps[task_name]:
                vals[(task_name, metric + decontaminate_suffix)].append(value)

# aggregate results
for (task_name, metric), items in vals.items():
    task = task_dict[task_name]
    real_metric = metric  # key when looking up the metric with task.aggregation
    if metric.endswith(decontaminate_suffix):
        real_metric = metric.replace(
            decontaminate_suffix, ""
        )  # decontaminated still uses the same metric
    results[task_name][metric] = task.aggregation()[real_metric](items)

    # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
    # so we run them for fewer iterations. still looking for a cleaner way to do this
    stderr = lm_eval.metrics.stderr_for_metric(
        metric=task.aggregation()[real_metric],
        bootstrap_iters=min(bootstrap_iters, 1000)
        if metric in ["bleu", "chrf", "ter"]
        else bootstrap_iters,
    )

    if stderr is not None:
        results[task_name][metric + "_stderr"] = stderr(items)

if write_out:
    import json
    import pathlib

    output_base_path = (
        pathlib.Path(output_base_path)
        if output_base_path is not None
        else pathlib.Path(".")
    )
    try:
        output_base_path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        pass

    for task_name, _ in task_dict_items:
        with open(
            output_base_path.joinpath(f"{task_name}_write_out_info.json"),
            "w",
            encoding="utf8",
        ) as fp:
            json.dump(write_out_info[task_name], fp, indent=4, ensure_ascii=False)

return {"results": dict(results), "versions": dict(versions)}


    eval_logger.info(f"Running {reqtype} requests")
    # create `K` copies of each request `req` based off `K = req.repeats`
    cloned_reqs = []
    for req in reqs:
        cloned_reqs.extend([req] * req.repeats)

    if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
        for _ in range(padding_requests[reqtype]):
            cloned_reqs.extend([req] * req.repeats)

    # run requests through model
    resps = getattr(lm, reqtype)(cloned_reqs)

    # put responses from model into a list of length K for each request.
    for x, req in zip(resps, cloned_reqs):
        req.resps.append(x)

    if lm.world_size > 1:
        lm.accelerator.wait_for_everyone()

RANK = lm.rank
WORLD_SIZE = lm.world_size
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_output in eval_tasks:
    task = task_output.task
    task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    # # unpack results and sort back in order and return control to Task
    # TODO: make it possible to use a different metric per filter
    # Pre-process task.instances to group by doc_id
    instances_by_doc_id = defaultdict(list)
    for instance in task.instances:
        instances_by_doc_id[instance.doc_id].append(instance)
    # Sort instances within each group
    for instances in instances_by_doc_id.values():
        instances.sort(key=lambda x: x.idx)
    # iterate over different filters used
    for filter_key in task.instances[0].filtered_resps.keys():
        doc_iterator = task.doc_iterator(
            rank=RANK, limit=limit, world_size=WORLD_SIZE
        )
        for doc_id, doc in doc_iterator:
            requests = instances_by_doc_id[doc_id]
            metrics = task.process_results(
                doc, [req.filtered_resps[filter_key] for req in requests]
            )
            if log_samples:
                target = task.doc_to_target(doc)
                example = {
                    "doc_id": doc_id,
                    "doc": doc,
                    "target": target,
                    "arguments": [req.args for req in requests],
                    "resps": [req.resps for req in requests],
                    "filtered_resps": [
                        req.filtered_resps[filter_key] for req in requests
                    ],
                    "doc_hash": hash_string(
                        json.dumps(
                            requests[0].doc,
                            indent=2,
                            default=handle_non_serializable,
                            ensure_ascii=False,
                        )
                    ),
                    "prompt_hash": hash_string(requests[0].arguments[0]),
                    "target_hash": hash_string(str(target)),
                }
                example.update(metrics)
                task_output.logged_samples.append(example)
            for metric, value in metrics.items():
                task_output.sample_metrics[(metric, filter_key)].append(value)

if WORLD_SIZE > 1:
    # if multigpu, then gather data across all ranks to rank 0
    # first gather logged samples across all ranks
    for task_output in eval_tasks:
        if log_samples:
            # for task_name, task_samples in list(samples.items()):
            full_samples = [None] * WORLD_SIZE if RANK == 0 else None
            torch.distributed.gather_object(
                obj=task_output.logged_samples,
                object_gather_list=full_samples,
                dst=0,
            )
            if RANK == 0:
                task_output.logged_samples = list(
                    itertools.chain.from_iterable(full_samples)
                )

        # then collect metrics across all ranks
        for metrics in task_output.sample_metrics:
            metric_list = [None] * WORLD_SIZE if RANK == 0 else None
            torch.distributed.gather_object(
                obj=task_output.sample_metrics[metrics],
                object_gather_list=metric_list,
                dst=0,
            )
            if RANK == 0:
                task_output.sample_metrics[metrics] = list(
                    itertools.chain.from_iterable(metric_list)
                )

if RANK == 0:
    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for task_output in eval_tasks:
        task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
    results, samples, configs, versions, num_fewshot = consolidate_results(
        eval_tasks
    )

    ### Calculate group metrics ###
    if bool(results):
        for group, task_list in reversed(task_hierarchy.items()):
            if len(task_list) == 0:
                # task_hierarchy entries are either
                # `group_name: [subtask1, subtask2, ...]`
                # or `task_name: []`.
                # we only want to operate on groups here.
                continue
            metric_list = list(
                {
                    key
                    for task in task_list
                    for key in results[task].keys()
                    if "_stderr" not in key and key not in ["alias", "samples"]
                }
            )
            for metric in metric_list:
                stderr = "_stderr,".join(metric.split(","))

                # gather metrics, sizes, and stderrs from subtasks
                metrics = [
                    results[task][metric]
                    for task in task_list
                    if metric in results[task]
                ]  # TODO: copy?
                stderrs = [
                    results[task][stderr]
                    for task in task_list
                    if stderr in results[task]
                ]
                sizes = [
                    results[task]["samples"]
                    for task in task_list
                    if metric in results[task]
                ]

                # compute group's pooled metric and stderr
                results[group][
                    metric
                ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
                # TODO: calculate grouped metric using aggregation fn
                if "N/A" in stderrs:
                    results[group][stderr] = "N/A"
                else:
                    results[group][
                        stderr
                    ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                    # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                    # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                    # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

                results[group]["samples"] = sum(sizes)

    results_agg = defaultdict(dict)
    groups_agg = defaultdict(dict)
    all_tasks_list = list(task_hierarchy.keys())
    while True:
        add_tasks_list = list(k for k in results_agg.keys())
        left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
        if len(left_tasks_list) == 0:
            break

        _task_hierarchy = {
            k: v for k, v in task_hierarchy.items() if k in left_tasks_list
        }
        _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

        results_agg = {**results_agg, **_results_agg}
        groups_agg = {**groups_agg, **_groups_agg}

    for group_name, task_list in task_hierarchy.items():
        if task_list:
            num_fewshot[group_name] = num_fewshot[
                task_list[0]
            ]  # TODO: validate this

    results_dict = {
        "results": dict(results_agg.items()),
        **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
        "group_subtasks": dict(reversed(task_hierarchy.items())),
        "configs": dict(sorted(configs.items())),
        "versions": dict(sorted(versions.items())),
        "n-shot": dict(sorted(num_fewshot.items())),
        "n-samples": {
            task_output.task_name: {
                "original": len(task_output.task.eval_docs),
                "effective": min(
                    limit if limit else len(task_output.task.eval_docs),
                    len(task_output.task.eval_docs),
                ),
            }
            for task_output in eval_tasks
        },
    }
    if log_samples:
        results_dict["samples"] = dict(samples)

    return results_dict

else:
    return None


def make_table(result_dict):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []

    for k, dic in result_dict["results"].items():
        version = result_dict["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"):
                continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
            else:
                values.append([k, version, m, "%.4f" % v, "", ""])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    return md_writer.dumps()


def request_caching_arg_to_dict(cache_requests: str) -> dict:
    request_caching_args = {
        "cache_requests": cache_requests in {"true", "refresh"},
        "rewrite_requests_cache": cache_requests == "refresh",
        "delete_requests_cache": cache_requests == "delete",
    }

    return request_caching_args
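# Illustrative usage sketch (added for clarity; not part of the original module).
# It exercises only the helper defined directly above.
def _example_request_caching_arg_to_dict():
    args = request_caching_arg_to_dict("refresh")
    # "refresh" reuses and rewrites the request cache, but does not delete it.
    assert args == {
        "cache_requests": True,
        "rewrite_requests_cache": True,
        "delete_requests_cache": False,
    }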
import collections
import math
import pathlib
import sys
from typing import Dict, List, Optional, Tuple, Union
from lm_eval.api import metrics
from lm_eval.utils import eval_logger, positional_deprecated
class TaskOutput:
"""
    Wrapper class for Task outputs. It contains various attributes and methods to manage and calculate metrics for the task.
Attributes:
task (object): The task object.
task_name (str): The name of the task.
task_config (dict): The configuration of the task.
version (str): The version of the task.
group_name (str): The name of the task group.
n_shot (int): The number of shots for the task.
task_alias (str): The alias of the task.
group_alias (str): The alias of the task group.
is_group (bool): Indicates if the task is a group.
logged_samples (list): The list of logged samples.
sample_len (int): The length of the samples.
sample_metrics (defaultdict): The dictionary of samples' metrics.
agg_metrics (defaultdict): The dictionary of aggregate metrics.
Methods:
from_taskdict(cls, task_name: str, task):
Creates a TaskOutput instance from a task dictionary.
calculate_aggregate_metric(bootstrap_iters=100000) -> None:
Calculates the aggregate metrics for the task.
"""
def __init__(
self,
task=None,
task_name=None,
task_config=None,
version=None,
group_name=None,
n_shot=None,
task_alias=None,
group_alias=None,
is_group=None,
):
self.task = task
self.task_config = task_config
self.task_name = task_name
self.group_name = group_name
self.version = version
self.n_shot = n_shot
self.task_alias = task_alias
self.group_alias = group_alias
self.is_group = is_group
self.logged_samples = []
self.sample_len = None
self.sample_metrics = collections.defaultdict(list)
self.agg_metrics = collections.defaultdict(list)
@classmethod
def from_taskdict(cls, task_name: str, task):
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
if not task:
            # these get filtered out in get_task_list
# once they are added to group hierarchy
is_group = True
return cls(
task=task, task_name=task_name, is_group=is_group, group_name=group_name
)
version = task.VERSION
task_config = dict(task.dump_config())
if (n_shot := task_config.get("num_fewshot")) == 0:
n_shot = task_config.get("metadata", {}).get("num_fewshot", 0)
task_alias = task_config.get("alias")
group_alias = task_config.get("group_alias")
return cls(
task=task,
task_name=task_name,
task_config=task_config,
group_name=group_name,
version=version,
n_shot=n_shot,
task_alias=task_alias,
group_alias=group_alias,
)
def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
for (metric, filter_key), items in self.sample_metrics.items():
agg_fn = self.task.aggregation()[metric]
metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
if bootstrap_iters:
stderr_fn = metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
def __repr__(self):
return (
f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, "
f"version={self.version},"
f"n_shot={self.n_shot}"
f"task_alias={self.task_alias}, group_alias={self.group_alias})"
)
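# Illustrative sketch (not part of the original module): how a TaskOutput accumulates
# per-sample metric values and then aggregates them. The `task` argument is assumed to
# expose an `aggregation()` mapping with an "acc" entry, as `calculate_aggregate_metric`
# expects; the values below are invented.
def _example_task_output_aggregation(task, task_name="demo_task"):
    out = TaskOutput(task=task, task_name=task_name, version=0, n_shot=0)
    # one (metric, filter_key) bucket, filled with per-document accuracy values
    out.sample_metrics[("acc", "none")].extend([1.0, 0.0, 1.0, 1.0])
    out.calculate_aggregate_metric(bootstrap_iters=0)  # 0 skips stderr bootstrapping
    return out.agg_metrics["acc,none"]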
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
task_hierarchy = collections.defaultdict(list)
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
for task_output in outputs:
if group_name := task_output.group_name:
task_hierarchy[group_name].append(task_output.task_name)
else:
task_hierarchy[task_output.task_name] = []
# returns task_hierarchy tracking which groups contain which subtasks,
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task]
def print_writeout(task) -> None:
for inst in task.instances:
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
if limit is not None:
limit = (
int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
)
return limit
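# Illustrative sketch (not part of the original module): `limit` may be a fraction of
# the eval set or an absolute count. The task stub below only provides `eval_docs`.
def _example_get_sample_size():
    class _TaskStub:
        eval_docs = list(range(200))

    assert get_sample_size(_TaskStub(), None) is None  # no limit
    assert get_sample_size(_TaskStub(), 0.1) == 20  # fraction of 200 docs
    assert get_sample_size(_TaskStub(), 50) == 50  # absolute cap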
def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0
) -> Tuple[dict, dict]:
"""
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task
hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
"""
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]:
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
else:
results_agg[group_name]["alias"] = tab_string + group_name
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy()
# groups_agg[group_name]["tab"] = tab
if "samples" in groups_agg[group_name]:
groups_agg[group_name].pop("samples")
if "alias" in groups_agg[group_name]:
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
)
else:
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list:
if task_name in task_hierarchy:
_task_hierarchy = {
**{task_name: task_hierarchy[task_name]},
**task_hierarchy,
}
else:
_task_hierarchy = {
**{task_name: []},
**task_hierarchy,
}
_results_agg, _groups_agg = prepare_print_tasks(
_task_hierarchy, results, tab + 1
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg
def consolidate_results(
eval_tasks: List[TaskOutput],
) -> Tuple[dict, dict, dict, dict, dict]:
"""
@param eval_tasks: list(TaskOutput).
@return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
Consolidates the results of multiple evaluation tasks into a single structure.
The method iterates over each evaluation instance and extracts relevant information to create the consolidated
results structure. The consolidated results structure has the following properties:
- results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
aliases specified in the task configuration.
- samples: A defaultdict with task names as keys and lists of log samples as values.
- configs: A defaultdict with task names as keys and task configurations as values.
- versions: A defaultdict with task names as keys and task versions as values.
- num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
"""
# stores the final result for each task, for each metric/filter pair.
results = collections.defaultdict(dict)
# logs info about each document evaluated.
samples = collections.defaultdict(list)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
    # Tracks the YAML configs of all chosen tasks.
configs = collections.defaultdict(dict)
# Tracks each task's version.
versions = collections.defaultdict(dict)
for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"]
if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias
num_fewshot[task_output.task_name] = task_output.n_shot
configs[task_output.task_name] = task_output.task_config
versions[task_output.task_name] = task_output.version
samples[task_output.task_name] = task_output.logged_samples
for (metric, filter_key), items in task_output.sample_metrics.items():
metric_key = f"{metric},{filter_key}"
results[task_output.task_name][metric_key] = task_output.agg_metrics[
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
return results, samples, configs, versions, num_fewshot
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
from functools import partial
from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
def build_filter_ensemble(
filter_name: str, components: List[List[str]]
) -> FilterEnsemble:
"""
Create a filtering pipeline.
"""
filters = []
for function, kwargs in components:
if kwargs is None:
kwargs = {}
# create a filter given its name in the registry
f = partial(get_filter(function), **kwargs)
# add the filter as a pipeline step
filters.append(f)
return FilterEnsemble(name=filter_name, filters=filters)
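# Illustrative sketch (not part of the original module): building a two-step ensemble.
# The filter names assume the "regex" and "take_first" registrations that appear later
# in this dump; the ensemble name and pattern are only example values.
def _example_build_filter_ensemble():
    return build_filter_ensemble(
        "strict-match",
        [
            ["regex", {"regex_pattern": r"#### (\-?[0-9\.\,]+)"}],
            ["take_first", None],
        ],
    )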
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
    A filter which splits documents into contaminated and non-contaminated subsets (not yet implemented).
"""
name = "track_decontamination"
def __init__(self, path) -> None:
"""
TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
should further cache result on a given (task_name, doc_id)
"""
self._decontam_results = None
def apply(self, resps, docs) -> None:
"""
Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
"""
pass
import re
import sys
import unicodedata
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
""" """
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
`fallback` defines the output returned if no matches for the regex are located.
"""
self.regex_pattern = regex_pattern
self.regex = re.compile(regex_pattern)
self.group_select = group_select
self.fallback = fallback
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
# independently (and keep them a list.)
def filter_set(inst):
filtered = []
for resp in inst:
match = self.regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
else:
match = self.fallback
filtered.append(match)
return filtered
# print(resps)
filtered_resps = list(map(lambda x: filter_set(x), resps))
# print(filtered_resps)
return filtered_resps
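# Illustrative sketch (not part of the original module): the default pattern pulls the
# final "#### <number>" answer out of each response; unmatched responses fall back to
# "[invalid]". The response strings are invented.
def _example_regex_filter():
    f = RegexFilter()
    resps = [["The answer is #### 42", "no final answer here"]]
    assert f.apply(resps, docs=None) == [["42", "[invalid]"]]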
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
resp = resp[1:]
filtered_resp.append(resp)
return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps]
return filtered_resps
@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter(RegexFilter):
"""
A filter used to extract a model's answer on multiple choice questions with
letter answers. assumes each document has a "choices" field
containing the list of answer choices and that the answer label symbols
are of the form (A), (B), (C), ... or A, B, C.
"""
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
        regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure:
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
group_select: Selects the (group_select)th match from the findall result.
ignore_case: Ignores the case during step 1 matching
ignore_punctuation: Remove the punctuation during step 1 matching
regexes_to_ignore: Remove these regexes during step 1 matching
"""
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
self.ignore_punctuation = ignore_punctuation
self.regexes_to_ignore = regexes_to_ignore
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
# independently (and keep them a list.)
def find_match(regex, resp, convert_dict={}):
match = regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
if match and match in convert_dict:
match = convert_dict[match]
return match
punct_tbl = dict.fromkeys(
i
for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith("P")
)
def filter_ignores(st):
if self.regexes_to_ignore is not None:
for s in self.regexes_to_ignore:
st = re.sub(s, "", st)
if self.ignore_case:
st = st.lower()
if self.ignore_punctuation:
# https://stackoverflow.com/a/266162
st = st.translate(punct_tbl)
return st
filtered_resps = []
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
choices = doc["choices"]
for c in choices:
m = filter_ignores(c.strip())
fallback_regexes.append(f"{re.escape(m)}")
choice_to_alpha[m] = f"({next_alpha})"
without_paren_fallback_regexes.append(next_alpha)
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = find_match(self.regex, resp)
if not match:
match = find_match(
fallback_regex, filter_ignores(resp), choice_to_alpha
)
if not match:
match = find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
filtered_resps.append(filtered)
return filtered_resps
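# Illustrative sketch (not part of the original module): when the "#### ..." pattern
# fails, the filter falls back to matching the answer choices themselves and maps the
# matched choice back to its letter label. The doc and response values are invented.
def _example_multi_choice_regex_filter():
    f = MultiChoiceRegexFilter(ignore_case=True)
    docs = [{"choices": ["Paris", "London", "Rome"]}]
    resps = [["The capital of France is Paris."]]
    assert f.apply(resps, docs) == [["(A)"]]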
from collections import Counter
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
@register_filter("take_first")
class TakeFirstFilter(Filter):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps, docs):
"""
Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
"""
return map(lambda r: r[0], resps)
@register_filter("take_first_k")
class TakeKFilter(Filter):
def __init__(self, **kwargs) -> None:
self.k = kwargs.pop("k")
super().__init__(**kwargs)
def apply(self, resps, docs):
# need resp to be subscriptable to check below
resps = list(resps)
# check we have at least k responses per doc, else we can't take the first k
assert (
len(resps[0]) >= self.k
), f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
return map(lambda r: r[: self.k], resps)
@register_filter("majority_vote")
class MajorityVoteFilter(Filter):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps, docs):
"""
Each entry of `resps` is a list of model responses.
We select the response that occurs most frequently in each entry of `resps`.
"""
def select_majority(resp):
counts = Counter(resp)
vote = counts.most_common(1)[0][0]
return vote
return map(lambda r: [select_majority(r)], resps)
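# Illustrative sketch (not part of the original module): majority voting over repeated
# samples for one document (e.g. self-consistency style decoding). Values are invented.
def _example_majority_vote():
    f = MajorityVoteFilter()
    resps = [["42", "41", "42"]]
    assert list(f.apply(resps, docs=None)) == [["42"]]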
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("lowercase")
class LowercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.lower() for resp in inst]
return [filter_set(resp) for resp in resps]
@register_filter("uppercase")
class UppercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.upper() for resp in inst]
return [filter_set(resp) for resp in resps]
@register_filter("map")
class MapFilter(Filter):
def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
"""
Initializes the MapFilter with a given mapping dictionary and default value.
Args:
- mapping_dict (dict): A dictionary containing the key-value mappings.
Default is an empty dictionary.
- default_value (Any): The value to be returned when a key is not found in the mapping_dict.
Default is None.
Example:
mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
"""
if mapping_dict is None:
mapping_dict = {}
assert isinstance(
mapping_dict, dict
), "Provided mapping_dict is not a dictionary"
self.mapping_dict = mapping_dict
self.default_value = default_value
def apply(self, resps, docs):
def filter_set(inst):
return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
return [filter_set(resp) for resp in resps]
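# Illustrative sketch (not part of the original module): mapping letter answers onto
# label indices, with a default for anything unexpected.
def _example_map_filter():
    f = MapFilter({"A": 0, "B": 1}, default_value=-1)
    assert f.apply([["A", "B", "C"]], docs=None) == [[0, 1, -1]]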
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
        end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
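# Illustrative sketch (not part of the original module): how a model_args string is
# reduced to a model name and sanitized for use as a directory name. The argument
# string is an invented example.
def _example_general_config_tracker():
    tracker = GeneralConfigTracker()
    tracker.log_experiment_args(
        model_source="hf",
        model_args="pretrained=org/model-7b,dtype=float16",
    )
    assert tracker.model_name == "org/model-7b"
    assert tracker.model_name_sanitized == "org__model-7b"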
class EvaluationTracker:
"""
Keeps track and saves relevant information of the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
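# Illustrative sketch (not part of the original module): ",none" is the suffix added
# for the default filter, and is stripped from metric names before logging.
def _example_remove_none_pattern():
    assert remove_none_pattern("acc,none") == ("acc", True)
    assert remove_none_pattern("acc,strict-match") == ("acc,strict-match", False)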
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
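# Illustrative sketch (not part of the original module): typical use as the `default`
# hook of json.dumps, so numpy integers and sets do not raise TypeError.
def _example_handle_non_serializable():
    import json

    payload = {"doc_id": np.int64(3), "tags": {"qa", "dialog"}}
    return json.dumps(payload, default=_handle_non_serializable)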
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
import copy
import json
import logging
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
"""Returns a wandb printer instance for pretty stdout."""
from wandb.sdk.lib.printer import get_printer
from wandb.sdk.wandb_settings import Settings
printer = get_printer(Settings()._jupyter)
return printer
class WandbLogger:
def __init__(self, **kwargs) -> None:
"""Attaches to wandb logger if already initialized. Otherwise, passes kwargs to wandb.init()
Args:
kwargs Optional[Any]: Arguments for configuration.
Parse and log the results returned from evaluator.simple_evaluate() with:
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
wandb_logger.log_eval_samples(results["samples"])
"""
try:
import wandb
assert Version(wandb.__version__) >= Version("0.13.6")
if Version(wandb.__version__) < Version("0.13.6"):
wandb.require("report-editing:v0")
except Exception as e:
logger.warning(
"To use the wandb reporting functionality please install wandb>=0.13.6.\n"
"To install the latest version of wandb run `pip install wandb --upgrade`\n"
f"{e}"
)
self.wandb_args: Dict[str, Any] = kwargs
# initialize a W&B run
if wandb.run is None:
self.run = wandb.init(**self.wandb_args)
else:
self.run = wandb.run
self.printer = get_wandb_printer()
def post_init(self, results: Dict[str, Any]) -> None:
self.results: Dict[str, Any] = copy.deepcopy(results)
self.task_names: List[str] = list(results.get("results", {}).keys())
self.group_names: List[str] = list(results.get("groups", {}).keys())
def _get_config(self) -> Dict[str, Any]:
"""Get configuration parameters."""
self.task_configs = self.results.get("configs", {})
cli_configs = self.results.get("config", {})
configs = {
"task_configs": self.task_configs,
"cli_configs": cli_configs,
}
return configs
def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
"""Sanitize the results dictionary."""
_results = copy.deepcopy(self.results.get("results", dict()))
# Remove None from the metric string name
tmp_results = copy.deepcopy(_results)
for task_name in self.task_names:
task_result = tmp_results.get(task_name, dict())
for metric_name, metric_value in task_result.items():
_metric_name, removed = remove_none_pattern(metric_name)
if removed:
_results[task_name][_metric_name] = metric_value
_results[task_name].pop(metric_name)
# remove string valued keys from the results dict
wandb_summary = {}
for task in self.task_names:
task_result = _results.get(task, dict())
for metric_name, metric_value in task_result.items():
if isinstance(metric_value, str):
wandb_summary[f"{task}/{metric_name}"] = metric_value
for summary_metric, summary_value in wandb_summary.items():
_task, _summary_metric = summary_metric.split("/")
_results[_task].pop(_summary_metric)
tmp_results = copy.deepcopy(_results)
for task_name, task_results in tmp_results.items():
for metric_name, metric_value in task_results.items():
_results[f"{task_name}/{metric_name}"] = metric_value
_results[task_name].pop(metric_name)
for task in self.task_names:
_results.pop(task)
return wandb_summary, _results
def _log_results_as_table(self) -> None:
"""Generate and log evaluation results as a table to W&B."""
columns = [
"Version",
"Filter",
"num_fewshot",
"Metric",
"Value",
"Stderr",
]
def make_table(columns: List[str], key: str = "results"):
import wandb
table = wandb.Table(columns=columns)
results = copy.deepcopy(self.results)
for k, dic in results.get(key).items():
if k in self.group_names and not key == "groups":
continue
version = results.get("versions").get(k)
if version == "N/A":
version = None
n = results.get("n-shot").get(k)
for (mf), v in dic.items():
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
if m == "alias":
continue
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
if se != "N/A":
se = "%.4f" % se
table.add_data(*[k, version, f, n, m, str(v), str(se)])
else:
table.add_data(*[k, version, f, n, m, str(v), ""])
return table
# log the complete eval result to W&B Table
table = make_table(["Tasks"] + columns, "results")
self.run.log({"evaluation/eval_results": table})
if "groups" in self.results.keys():
table = make_table(["Groups"] + columns, "groups")
self.run.log({"evaluation/group_eval_results": table})
def _log_results_as_artifact(self) -> None:
"""Log results as JSON artifact to W&B."""
import wandb
dumped = json.dumps(
self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
)
artifact = wandb.Artifact("results", type="eval_results")
with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
f.write(dumped)
self.run.log_artifact(artifact)
def log_eval_result(self) -> None:
"""Log evaluation results to W&B."""
# Log configs to wandb
configs = self._get_config()
self.run.config.update(configs)
wandb_summary, self.wandb_results = self._sanitize_results_dict()
# update wandb.run.summary with items that were removed
self.run.summary.update(wandb_summary)
# Log the evaluation metrics to wandb
self.run.log(self.wandb_results)
# Log the evaluation metrics as W&B Table
self._log_results_as_table()
# Log the results dict as json to W&B Artifacts
self._log_results_as_artifact()
def _generate_dataset(
self, data: List[Dict[str, Any]], config: Dict[str, Any]
) -> pd.DataFrame:
"""Generate a dataset from evaluation data.
Args:
data (List[Dict[str, Any]]): The data to generate a dataset for.
config (Dict[str, Any]): The configuration of the task.
Returns:
pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
"""
ids = [x["doc_id"] for x in data]
labels = [x["target"] for x in data]
instance = [""] * len(ids)
resps = [""] * len(ids)
filtered_resps = [""] * len(ids)
model_outputs = {}
metrics_list = config["metric_list"]
metrics = {}
for metric in metrics_list:
metric = metric.get("metric")
if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
if metric in ["byte_perplexity", "bits_per_byte"]:
metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
else:
metrics[f"{metric}_words"] = [x[metric][1] for x in data]
else:
metrics[metric] = [x[metric] for x in data]
if config["output_type"] == "loglikelihood":
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
resps = [
f'log probability of continuation is {x["resps"][0][0][0]} '
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["resps"][0][0][1] else "be"
)
for x in data
]
filtered_resps = [
f'log probability of continuation is {x["filtered_resps"][0][0]} '
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["filtered_resps"][0][1] else "be"
)
for x in data
]
elif config["output_type"] == "multiple_choice":
instance = [x["arguments"][0][0] for x in data]
choices = [
"\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
for x in data
]
resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
filtered_resps = [
np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
]
elif config["output_type"] == "loglikelihood_rolling":
instance = [x["arguments"][0][0] for x in data]
resps = [x["resps"][0][0] for x in data]
filtered_resps = [x["filtered_resps"][0] for x in data]
elif config["output_type"] == "generate_until":
instance = [x["arguments"][0][0] for x in data]
resps = [x["resps"][0][0] for x in data]
filtered_resps = [x["filtered_resps"][0] for x in data]
model_outputs["raw_predictions"] = resps
model_outputs["filtered_predictions"] = filtered_resps
df_data = {
"id": ids,
"data": instance,
}
if config["output_type"] == "multiple_choice":
df_data["choices"] = choices
tmp_data = {
"input_len": [len(x) for x in instance],
"labels": labels,
"output_type": config["output_type"],
}
df_data.update(tmp_data)
df_data.update(model_outputs)
df_data.update(metrics)
return pd.DataFrame(df_data)
def _log_samples_as_artifact(
self, data: List[Dict[str, Any]], task_name: str
) -> None:
import wandb
# log the samples as an artifact
dumped = json.dumps(
data,
indent=2,
default=_handle_non_serializable,
ensure_ascii=False,
)
artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
with artifact.new_file(
f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
) as f:
f.write(dumped)
self.run.log_artifact(artifact)
# artifact.wait()
def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
"""Log evaluation samples to W&B.
Args:
samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
"""
task_names: List[str] = [
x for x in self.task_names if x not in self.group_names
]
ungrouped_tasks = []
tasks_by_groups = {}
for task_name in task_names:
group_names = self.task_configs[task_name].get("group", None)
if group_names:
if isinstance(group_names, str):
group_names = [group_names]
for group_name in group_names:
if not tasks_by_groups.get(group_name):
tasks_by_groups[group_name] = [task_name]
else:
tasks_by_groups[group_name].append(task_name)
else:
ungrouped_tasks.append(task_name)
for task_name in ungrouped_tasks:
eval_preds = samples[task_name]
# log the samples as a W&B Table
df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
self.run.log({f"{task_name}_eval_results": df})
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
for group, grouped_tasks in tasks_by_groups.items():
grouped_df = pd.DataFrame()
for task_name in grouped_tasks:
eval_preds = samples[task_name]
df = self._generate_dataset(
eval_preds, self.task_configs.get(task_name)
)
df["group"] = group
df["task"] = task_name
grouped_df = pd.concat([grouped_df, df], ignore_index=True)
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})