Commit a6b358ca authored by Rayyyyy

update version

parent ed53d51c
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Address all TODOs and remove all explanatory comments
"""QuAC dataset."""
import json
import datasets
_CITATION = """\
@article{choi2018quac,
title={Quac: Question answering in context},
author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:1808.07036},
year={2018}
}
"""
_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""
_HOMEPAGE = "https://quac.ai/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
"validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}
class Quac(datasets.GeneratorBasedBuilder):
"""Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog."""
VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="quac", version=VERSION, description="The QuAC dataset"
),
]
def _info(self):
features = datasets.Features(
{
"title": datasets.Value("string"),
"section_title": datasets.Value("string"),
"paragraph": datasets.Value("string"),
"question": datasets.Value("string"),
"answer": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["train"],
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
data = json.load(f)["data"]
key = 0
for row in data:
paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
qas = row["paragraphs"][0]["qas"]
qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
for (question, answer) in qa_pairs:
# Yields examples as (key, example) tuples
yield key, {
"title": row["title"],
"section_title": row["section_title"],
"paragraph": paragraph,
"question": question,
"answer": answer,
}
key += 1
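# Usage sketch: one way to load the builder above with the `datasets` library.
# The local script name "quac.py" is an assumption, and recent `datasets`
# releases may additionally require `trust_remote_code=True` for script-based
# datasets.
if __name__ == "__main__":
    quac = datasets.load_dataset("./quac.py", name="quac")
    sample = quac["validation"][0]
    # Each example is one (question, answer) pair plus its source paragraph.
    print(sample["title"], "|", sample["section_title"])
    print("Q:", sample["question"])
    print("A:", sample["answer"])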
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SAT Analogy Questions dataset."""
import os
import datasets
_CITATION = """\
@article{article,
author = {Turney, Peter},
year = {2006},
month = {09},
pages = {379-416},
title = {Similarity of Semantic Relations},
volume = {32},
journal = {Computational Linguistics},
doi = {10.1162/coli.2006.32.3.379}
}
"""
_DESCRIPTION = """\
SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.
"""
_HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
class SatAnalogies(datasets.GeneratorBasedBuilder):
"""SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="sat_analogies",
version=VERSION,
description="The SAT Analogy Questions dataset",
),
]
@property
def manual_download_instructions(self):
return (
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
def _info(self):
features = datasets.Features(
{
"source": datasets.Value("string"),
"stem": datasets.Value("string"),
"choices": datasets.features.Sequence(datasets.Value("string")),
"solution": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
if not os.path.exists(data_dir):
raise FileNotFoundError(
f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. Manual download instructions: {self.manual_download_instructions}"
)
return [
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": os.path.join(data_dir, "SAT-package-V3.txt"),
},
)
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath):
data = []
with open(filepath, "r", encoding="utf-8") as f:
record = []
for line in f:
line = line.strip()
if len(line) == 0 and record:
data.append(record)
record = []
elif len(line) > 0 and line[0] == "#":
# Skip comments.
continue
else:
record.append(line)
data.append(record)
for key, record in enumerate(data):
source = record[-8]
stem = record[-7]
choices = record[-6:-1]
solution = record[-1]
yield key, {
"source": source,
"stem": stem,
"choices": choices,
"solution": solution,
}
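# Usage sketch: SAT Analogy Questions must be obtained manually, as described in
# `manual_download_instructions` above. The local script name "sat_analogies.py"
# and the data_dir path are placeholder assumptions.
if __name__ == "__main__":
    sat = datasets.load_dataset("./sat_analogies.py", data_dir="path/to/folder")
    record = sat["validation"][0]
    print(record["stem"])      # analogy stem line from SAT-package-V3.txt
    print(record["choices"])   # the five candidate lines
    print(record["solution"])  # the line marking the correct choice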
{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unscramble dataset."""
import json
import os
import datasets
_CITATION = """\
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
"""
_DESCRIPTION = """\
Unscramble is a small battery of 5 “character manipulation” tasks. Each task
involves giving the model a word distorted by some combination of scrambling,
addition, or deletion of characters, and asking it to recover the original word.
"""
_HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
_DESCRIPTIONS = {
"mid_word_1_anagrams": "Anagrams of all but the first and last letter.",
"mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.",
"cycle_letters_in_word": "Cycle letters in the word.",
"random_insertion_in_word": "Random insertions in the word that must be removed.",
"reversed_words": "Words spelled backwards that must be reversed.",
}
_NAMES = _DESCRIPTIONS.keys()
class Unscramble(datasets.GeneratorBasedBuilder):
"""Unscramble is a small battery of 5 “character manipulation” tasks."""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=name, version=version, description=_DESCRIPTIONS[name]
)
for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
]
def _info(self):
features = datasets.Features(
{
"context": datasets.Value("string"),
"completion": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz")
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir,
"split": "validation",
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f):
data = json.loads(row)
yield key, {
"context": data["context"],
"completion": data["completion"],
}
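# Usage sketch: each key of `_DESCRIPTIONS` is a separate config of this builder.
# The local script name "unscramble.py" is an assumption.
if __name__ == "__main__":
    reversed_words = datasets.load_dataset("./unscramble.py", name="reversed_words")
    row = reversed_words["validation"][0]
    print(row["context"])     # prompt containing the distorted word
    print(row["completion"])  # the original word to recover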
import os
import zstandard
import json
import jsonlines
import io
import datetime
import io
import json
import mmap
import tqdm
import os
from pathlib import Path
from typing import Any
import jsonlines
import tqdm
import zstandard
def json_serial(obj):
def json_serial(obj: Any) -> str:
"""JSON serializer for objects not serializable by default json code"""
if isinstance(obj, (datetime.datetime,)):
@@ -19,7 +21,7 @@ def json_serial(obj):
# Modified version of lm_dataformat Archive for single file.
class Archive:
def __init__(self, file_path, compression_level=3):
def __init__(self, file_path: str, compression_level: int = 3) -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
@@ -28,7 +30,9 @@ class Archive:
self.cctx = zstandard.ZstdCompressor(level=compression_level)
self.compressor = self.cctx.stream_writer(self.fh)
def add_data(self, data, meta={}):
def add_data(self, data, meta=None) -> None:
if meta is None:
meta = {}
self.compressor.write(
json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
"UTF-8"
@@ -36,7 +40,7 @@ class Archive:
+ b"\n"
)
def commit(self):
def commit(self) -> None:
self.compressor.flush(zstandard.FLUSH_FRAME)
self.fh.flush()
self.fh.close()
@@ -44,10 +48,16 @@ class Archive:
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
def __init__(self):
def __init__(self) -> None:
pass
def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
def read(
self,
file,
get_meta: bool = False,
autojoin_paragraphs: bool = True,
para_joiner: str = "\n\n",
):
with open(file, "rb") as fh:
self.fh = fh
cctx = zstandard.ZstdDecompressor()
@@ -72,7 +82,7 @@ class Reader:
class TextArchive:
def __init__(self, file_path, mode="rb+"):
def __init__(self, file_path, mode: str = "rb+") -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
@@ -83,24 +93,24 @@ class TextArchive:
self.fh = open(self.file_path, mode)
def add_data(self, data):
def add_data(self, data) -> None:
self.fh.write(data.encode("UTF-8") + b"\n")
def commit(self):
def commit(self) -> None:
self.fh.flush()
self.fh.close()
class TextReader:
def __init__(self, file_path):
def __init__(self, file_path) -> None:
self.file_path = file_path
# Optimized mmap read with infrequent tqdm updates to maintain speed
# Tested up to 250MB/s.
def read_tqdm(self, update_frequency=10000):
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r") as fh, tqdm.tqdm(
with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
@@ -149,7 +159,7 @@ class TextReader:
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
def __init__(self, file):
def __init__(self, file) -> None:
self.file = file
def read_tqdm(self):
......
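# Usage sketch for the classes above (rough illustration; the file name
# "example.jsonl.zst" and the document text are placeholders):
#
#   archive = Archive("example.jsonl.zst")
#   archive.add_data("some document text", meta={"source": "demo"})
#   archive.commit()
#
#   reader = Reader()
#   for doc in reader.read("example.jsonl.zst"):
#       print(doc)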
import time
import random
import pickle
import json
import collections
import glob
import json
import os
import collections
import pickle
import random
import time
from .janitor import Janitor, word_ngrams
from .archiver import ZStdTextReader
from .janitor import Janitor, word_ngrams
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
simulated_overlap = 0.1
contaminated = int(len(docs) * simulated_overlap)
return random.sample(range(len(docs)), contaminated)
@@ -25,6 +25,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
# files. These should exist in the "ngrams_path" provided to this function.
# Algorithm:
# 1. Build lookups for each dataset {ngram: list(document_ids)}
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
@@ -33,11 +34,11 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# 4. Strip the task_set from the dictionary keys and return
#
# We cache the task+set lookups as well as the overlaps.
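# Rough shape of the intermediate structures (inferred from the comments above
# and the code below, shown here only as an illustration):
#   lookups[(task_name, task_set)] = {ngram: [doc_id, ...]}
#   merged_lookup[ngram] = [(task_name, task_set, [doc_id, ...]), ...]
#   duplicates[(task_name, task_set)] = {doc_id, ...}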
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
# return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
info_dict_path = os.path.join(ngrams_path, "info.json")
info_dict = json.load(open(info_dict_path, "r"))
info_dict = json.load(open(info_dict_path, "r", encoding="utf-8"))
ngrams_n_size = info_dict["ngram_size"]
janitor = Janitor()
@@ -46,7 +47,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print("Building Lookups...")
start = time.perf_counter()
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
lookups = {}
@@ -108,7 +109,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print(f"Merging lookups took {elapsed:0.5f} seconds.")
print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst"))
files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
print(files)
for file in files:
@@ -134,11 +135,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
matching_unique += 1
for task_name, task_set, doc_ids in merged_lookup[ngram]:
task_doc_set = duplicates[(task_name, task_set)]
for (
doc_id
) in (
doc_ids
): # Record contamination across all relevant task/set combos
for doc_id in doc_ids: # Record contamination across all relevant task/set combos
task_doc_set.add(doc_id)
del merged_lookup[ngram] # No point matching again
else:
......
import pickle
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint
from typing import Iterator, List, Sequence, Tuple, TypeVar
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
@@ -16,10 +16,12 @@ except Exception:
traceback.print_exc()
JANITOR_CPP = False
T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
@@ -36,7 +38,7 @@ def form_ngrams(sequence, n):
del history[0]
def word_ngrams(s, n):
def word_ngrams(s: str, n: int) -> Iterator[str]:
"""Splits a string into ngram words"""
tokens = s.split() # not a generator :(
ngram_seqs = form_ngrams(iter(tokens), n)
@@ -68,14 +70,14 @@ def word_ngrams(s, n):
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
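# Illustrative example: list(split_indices("to be or"))
#   -> [("to", (0, 1)), ("be", (3, 4)), ("or", (6, 7))]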
def word_ngrams_indices(s, n):
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
@@ -104,16 +106,15 @@ def word_ngrams_indices(s, n):
class Janitor:
# FIXME delete_chars: Should anything else go here? Special chars?
def __init__(
self,
ngram_n=13,
window_to_remove=200,
too_dirty_cutoff=10,
minimum_slice_length=200,
delete_chars=string.punctuation,
):
ngram_n: int = 13,
window_to_remove: int = 200,
too_dirty_cutoff: int = 10,
minimum_slice_length: int = 200,
delete_chars: str = string.punctuation,
) -> None:
self.ngram_n = ngram_n
self.window_to_remove = window_to_remove
self.too_dirty_cutoff = too_dirty_cutoff
@@ -135,11 +136,11 @@ class Janitor:
# I/O for saving contamination ngrams
##############
def save_contamination_ngrams(self, filename):
def save_contamination_ngrams(self, filename: str) -> None:
with open(filename, "wb") as fp:
pickle.dump(filename, fp)
def load_contamination_ngrams(self, filename):
def load_contamination_ngrams(self, filename: str) -> None:
with open(filename, "rb") as fp:
self.dirt_ngrams = pickle.load(fp)
@@ -147,7 +148,7 @@ class Janitor:
# Call these :)
##############
def register_contaminant(self, dirt_string):
def register_contaminant(self, dirt_string: str) -> None:
"""Register a string as contamination to be removed, e.g. a test set
This breaks the dirt_string into ngrams to store for future cleaning"""
if JANITOR_CPP:
@@ -156,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string):
def clean(self, dirty_string: str) -> List[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
@@ -166,7 +167,9 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.clean_python(dirty_string)
def _split_chunks(self, dirty_string, dirty_parts):
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[Tuple]
) -> List[str]:
clean_chunks = []
splice_idx = 0
end = -1
@@ -189,12 +192,12 @@ class Janitor:
# Fast C++
##############
def register_contaminant_cpp(self, dirt_string):
def register_contaminant_cpp(self, dirt_string) -> None:
self.dirt_ngrams.update(
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string):
def clean_cpp(self, dirty_string: str) -> List[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
@@ -204,15 +207,15 @@ class Janitor:
# Slow python
##############
def normalize_string(self, s):
def normalize_string(self, s: str) -> str:
return s.translate(self.translation_table)
def register_contaminant_python(self, dirt_string):
def register_contaminant_python(self, dirt_string: str) -> None:
self.dirt_ngrams.update(
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string):
def clean_python(self, dirty_string: str) -> List[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
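# Usage sketch for the Janitor class above (illustrative strings only):
#
#   janitor = Janitor(ngram_n=13)
#   janitor.register_contaminant(test_set_text)   # e.g. concatenated test documents
#   clean_chunks = janitor.clean(training_text)   # clean slices, [] if too dirty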
import collections
import itertools
import numpy as np
import json
import logging
import random
import lm_eval.metrics
import time
from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union
import numpy as np
import torch
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
from lm_eval.utils import positional_deprecated, run_task_tests
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
consolidate_results,
get_sample_size,
get_task_list,
prepare_print_tasks,
print_writeout,
run_task_tests,
)
from lm_eval.logging.utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
simple_parse_args_string,
)
if TYPE_CHECKING:
from lm_eval.api.model import LM
from lm_eval.tasks import Task
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
tasks=[],
num_fewshot=0,
batch_size=None,
max_batch_size=None,
device=None,
no_cache=False,
limit=None,
bootstrap_iters=100000,
description_dict=None,
check_integrity=False,
decontamination_ngrams_path=None,
write_out=False,
output_base_path=None,
log_samples=True,
gen_kwargs=None,
model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
delete_requests_cache: bool = False,
limit: Optional[Union[int, float]] = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
write_out: bool = False,
log_samples: bool = True,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO",
predict_only: bool = False,
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
Name of model or LM object, see lm_eval.models.get_model
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
:param model_args: Optional[str, dict]
String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
:param tasks: list[Union[str, dict, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
@@ -46,49 +81,174 @@ def simple_evaluate(
Maximal batch size to try with automatic batch size detection
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param use_cache: str, optional
A path to a sqlite db file for caching model responses. `None` if not caching.
:param cache_requests: bool, optional
Speed up evaluation by caching the building of dataset requests. `None` if not caching.
:param rewrite_requests_cache: bool, optional
Rewrites all of the request cache if set to `True`. `None` if not desired.
:param delete_requests_cache: bool, optional
Deletes all of the request cache if set to `True`. `None` if not desired.
:param limit: int or float, optional
Limit the number of examples per task (only use this for testing). If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir.
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param gen_kwargs: str
String arguments for model generation
Ignored for all tasks with loglikelihood output_type
:param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int
Random seed for python's random module. If set to None, the seed will not be set.
:param numpy_random_seed: int
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:param fewshot_random_seed: int
Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
start_date = time.time()
if delete_requests_cache:
eval_logger.info("Deleting requests cache...")
delete_cache()
seed_message = []
if random_seed is not None:
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
seed_message.append(f"Setting random seed to {random_seed}")
random.seed(random_seed)
if numpy_random_seed is not None:
seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
np.random.seed(numpy_random_seed)
if torch_random_seed is not None:
seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if seed_message:
eval_logger.info(" | ".join(seed_message))
if tasks is None:
tasks = []
if len(tasks) == 0:
raise ValueError(
"No tasks specified, or no tasks found. Please verify the task names."
)
assert tasks != [], "No tasks specified"
if gen_kwargs is not None:
gen_kwargs = simple_parse_args_string(gen_kwargs)
eval_logger.warning(
"generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. "
"Ensure 'do_sample=True' for non-greedy decoding!"
)
if gen_kwargs == "":
gen_kwargs = None
if isinstance(model, str):
if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
if isinstance(model_args, dict):
eval_logger.info(
f"Initializing {model} model, with arguments: {model_args}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
else:
assert isinstance(model, lm_eval.base.LM)
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
else:
if not isinstance(model, lm_eval.api.model.LM):
raise TypeError
eval_logger.info("Using pre-initialized model")
lm = model
if not no_cache:
lm = lm_eval.base.CachingLM(
if use_cache is not None:
eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
lm = lm_eval.api.model.CachingLM(
lm,
"lm_cache/"
+ (model if isinstance(model, str) else model.model.config._name_or_path)
+ "_"
+ model_args.replace("=", "-").replace(",", "_").replace("/", "-")
use_cache
# each rank receives a different cache db.
# necessary to avoid multiple writes to cache at once
+ "_rank"
+ str(lm.rank)
+ ".db",
)
task_dict = lm_eval.tasks.get_task_dict(tasks)
if task_manager is None:
task_manager = TaskManager(verbosity)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if isinstance(task_obj, tuple):
_, task_obj = task_obj
if task_obj is None:
continue
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
log_samples = True
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
task_obj.set_config(key="num_fewshot", value=0)
if check_integrity:
run_task_tests(task_list=tasks)
@@ -96,316 +256,366 @@ def simple_evaluate(
results = evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=num_fewshot,
limit=limit,
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters,
description_dict=description_dict,
decontamination_ngrams_path=decontamination_ngrams_path,
write_out=write_out,
output_base_path=output_base_path,
log_samples=log_samples,
verbosity=verbosity,
)
if lm.rank == 0:
if isinstance(model, str):
model_name = model
elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
model_name = model.config._name_or_path
else:
model_name = type(model).__name__
# add info about the model and few shot config
results["config"] = {
"model": (model if isinstance(model, str) else model.model.config._name_or_path),
"model": model_name,
"model_args": model_args,
"num_fewshot": num_fewshot,
}
# add more detailed model info if available
if isinstance(lm, lm_eval.models.huggingface.HFLM):
results["config"].update(lm.get_model_info())
# add info about execution
results["config"].update(
{
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"no_cache": no_cache,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"description_dict": description_dict,
"gen_kwargs": gen_kwargs,
"random_seed": random_seed,
"numpy_seed": numpy_random_seed,
"torch_seed": torch_random_seed,
"fewshot_seed": fewshot_random_seed,
}
)
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
return results
decontaminate_suffix = "_decontaminate"
else:
return None
@positional_deprecated
def evaluate(
lm,
lm: "LM",
task_dict,
provide_description=None,
num_fewshot=0,
limit=None,
bootstrap_iters=100000,
description_dict=None,
decontamination_ngrams_path=None,
write_out=False,
output_base_path=None,
limit: Optional[int] = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
bootstrap_iters: Optional[int] = 100000,
write_out: bool = False,
log_samples: bool = True,
verbosity: str = "INFO",
):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param write_out: bool
If True, write all prompts, logits and metrics to json for offline analysis
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
decontaminate = decontamination_ngrams_path is not None
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if (task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
overlaps = collections.defaultdict(list) # {task_name: contaminated_docs}
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
write_out_info = {}
docs_for_decontamination = collections.defaultdict(list)
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
task_set = "test" # Required for caching in the decontamination
elif task.has_validation_docs():
task_set = "val" # Required for caching in the decontamination
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
rnd = random.Random()
rnd.seed(42)
rnd.shuffle(task_docs)
print(f"Task: {task_name}; number of docs: {len(task_docs)}")
if write_out:
prompt_details = []
description = (
description_dict[task_name]
if description_dict and task_name in description_dict
else ""
)
if limit is not None:
limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
if decontaminate and task.should_decontaminate():
docs_for_decontamination[(task_name, task_set)].append(
task.doc_to_decontamination_query(doc)
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
# tracks all Instances/requests a model must generate output on.
requests = defaultdict(list)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = defaultdict(int)
# get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
task.build_all_requests(
limit=limit,
rank=lm.rank,
world_size=lm.world_size,
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
)
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
)
reqs = task.construct_requests(doc, ctx)
if write_out:
prompt_details.append({"doc_id": doc_id})
# print the prompt for the first few documents
if doc_id < 1:
print(
f"Task: {task_name}; document {doc_id}; context prompt (starting on next line):\n{ctx}\n(end of prompt on previous line)"
print_writeout(task)
# aggregate Instances by LM method requested to get output.
for instance in task.instances:
reqtype = instance.request_type
requests[reqtype].append(instance)
if lm.world_size > 1:
instances_rnk = torch.tensor(len(task._instances), device=lm.device)
gathered_item = (
lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
)
print("Requests:", reqs)
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
if write_out:
prompt_details[-1][f"prompt_{i}"] = "".join(
(map(lambda x: "".join(x), req.args))
# "multiple_choice" task types dispatch (several) "loglikelihood" request types
reqtype = (
"loglikelihood"
if task.OUTPUT_TYPE == "multiple_choice"
else task.OUTPUT_TYPE
)
# compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
numpad = max(gathered_item) - gathered_item[lm.rank]
# todo: may not account for padding in cases like SquadV2 which has multiple req types
padding_requests[reqtype] += numpad
if write_out:
write_out_info[task_name] = prompt_details
# Compare all tasks/sets at once to ensure a single training set scan
if decontaminate:
from lm_eval.decontamination.decontaminate import get_train_overlap
print("Finding train/test overlap, please wait...")
overlaps = get_train_overlap(
docs_for_decontamination, decontamination_ngrams_path, limit
)
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
### Run LM on inputs, get all outputs ###
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [
x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
process_res_queue[(task_name, doc_id)].append((i, resp))
if write_out:
write_out_info[task_name][doc_id][f"logit_{i}"] = resp
task = task_dict[task_name]
if isinstance(task, lm_eval.base.MultipleChoiceTask):
write_out_info[task_name][doc_id]["truth"] = doc["gold"]
elif isinstance(task, lm_eval.tasks.winogrande.Winogrande):
write_out_info[task_name][doc_id]["truth"] = task.answer_to_num[
doc["answer"]
]
else:
write_out_info[task_name][doc_id]["truth"] = task.doc_to_target(doc)
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for (task_name, doc_id), requests in process_res_queue.items():
    requests.sort(key=lambda x: x[0])
    requests = [x[1] for x in requests]

    task = task_dict[task_name]
    doc = docs[(task_name, doc_id)]

    metrics = task.process_results(doc, requests)
    for metric, value in metrics.items():
        vals[(task_name, metric)].append(value)

        if write_out:
            write_out_info[task_name][doc_id][metric] = str(value)

        # Re-use the evaluation for the decontaminated set by just ignoring the overlaps
        if decontaminate and task_name in overlaps:
            if doc_id not in overlaps[task_name]:
                vals[(task_name, metric + decontaminate_suffix)].append(value)

# aggregate results
for (task_name, metric), items in vals.items():
    task = task_dict[task_name]
    real_metric = metric  # key when looking up the metric with task.aggregation
    if metric.endswith(decontaminate_suffix):
        real_metric = metric.replace(
            decontaminate_suffix, ""
        )  # decontaminated still uses the same metric
    results[task_name][metric] = task.aggregation()[real_metric](items)

    # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
    # so we run them for fewer iterations. still looking for a cleaner way to do this
    stderr = lm_eval.metrics.stderr_for_metric(
        metric=task.aggregation()[real_metric],
        bootstrap_iters=min(bootstrap_iters, 1000)
        if metric in ["bleu", "chrf", "ter"]
        else bootstrap_iters,
    )

    if stderr is not None:
        results[task_name][metric + "_stderr"] = stderr(items)

if write_out:
    import json
    import pathlib

    output_base_path = (
        pathlib.Path(output_base_path)
        if output_base_path is not None
        else pathlib.Path(".")
    )
    try:
        output_base_path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        pass

    for task_name, _ in task_dict_items:
        with open(
            output_base_path.joinpath(f"{task_name}_write_out_info.json"),
            "w",
            encoding="utf8",
        ) as fp:
            json.dump(write_out_info[task_name], fp, indent=4, ensure_ascii=False)

return {"results": dict(results), "versions": dict(versions)}


    eval_logger.info(f"Running {reqtype} requests")
    # create `K` copies of each request `req` based off `K = req.repeats`
    cloned_reqs = []
    for req in reqs:
        cloned_reqs.extend([req] * req.repeats)

    if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
        for _ in range(padding_requests[reqtype]):
            cloned_reqs.extend([req] * req.repeats)

    # run requests through model
    resps = getattr(lm, reqtype)(cloned_reqs)

    # put responses from model into a list of length K for each request.
    for x, req in zip(resps, cloned_reqs):
        req.resps.append(x)

    if lm.world_size > 1:
        lm.accelerator.wait_for_everyone()

RANK = lm.rank
WORLD_SIZE = lm.world_size
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_output in eval_tasks:
    task = task_output.task
    task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    # # unpack results and sort back in order and return control to Task
    # TODO: make it possible to use a different metric per filter
    # Pre-process task.instances to group by doc_id
    instances_by_doc_id = defaultdict(list)
    for instance in task.instances:
        instances_by_doc_id[instance.doc_id].append(instance)
    # Sort instances within each group
    for instances in instances_by_doc_id.values():
        instances.sort(key=lambda x: x.idx)
    # iterate over different filters used
    for filter_key in task.instances[0].filtered_resps.keys():
        doc_iterator = task.doc_iterator(
            rank=RANK, limit=limit, world_size=WORLD_SIZE
        )
        for doc_id, doc in doc_iterator:
            requests = instances_by_doc_id[doc_id]
            metrics = task.process_results(
                doc, [req.filtered_resps[filter_key] for req in requests]
            )
            if log_samples:
                target = task.doc_to_target(doc)
                example = {
                    "doc_id": doc_id,
                    "doc": doc,
                    "target": target,
                    "arguments": [req.args for req in requests],
                    "resps": [req.resps for req in requests],
                    "filtered_resps": [
                        req.filtered_resps[filter_key] for req in requests
                    ],
                    "doc_hash": hash_string(
                        json.dumps(
                            requests[0].doc,
                            indent=2,
                            default=handle_non_serializable,
                            ensure_ascii=False,
                        )
                    ),
                    "prompt_hash": hash_string(requests[0].arguments[0]),
                    "target_hash": hash_string(str(target)),
                }
                example.update(metrics)
                task_output.logged_samples.append(example)
            for metric, value in metrics.items():
                task_output.sample_metrics[(metric, filter_key)].append(value)

if WORLD_SIZE > 1:
    # if multigpu, then gather data across all ranks to rank 0
    # first gather logged samples across all ranks
    for task_output in eval_tasks:
        if log_samples:
            # for task_name, task_samples in list(samples.items()):
            full_samples = [None] * WORLD_SIZE if RANK == 0 else None
            torch.distributed.gather_object(
                obj=task_output.logged_samples,
                object_gather_list=full_samples,
                dst=0,
            )
            if RANK == 0:
                task_output.logged_samples = list(
                    itertools.chain.from_iterable(full_samples)
                )

        # then collect metrics across all ranks
        for metrics in task_output.sample_metrics:
            metric_list = [None] * WORLD_SIZE if RANK == 0 else None
            torch.distributed.gather_object(
                obj=task_output.sample_metrics[metrics],
                object_gather_list=metric_list,
                dst=0,
            )
            if RANK == 0:
                task_output.sample_metrics[metrics] = list(
                    itertools.chain.from_iterable(metric_list)
                )

if RANK == 0:
    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for task_output in eval_tasks:
        task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
    results, samples, configs, versions, num_fewshot = consolidate_results(
        eval_tasks
    )

    ### Calculate group metrics ###
    if bool(results):
        for group, task_list in reversed(task_hierarchy.items()):
            if len(task_list) == 0:
                # task_hierarchy entries are either
                # `group_name: [subtask1, subtask2, ...]`
                # or `task_name: []`.
                # we only want to operate on groups here.
                continue
            metric_list = list(
                {
                    key
                    for task in task_list
                    for key in results[task].keys()
                    if "_stderr" not in key and key not in ["alias", "samples"]
                }
            )
            for metric in metric_list:
                stderr = "_stderr,".join(metric.split(","))

                # gather metrics, sizes, and stderrs from subtasks
                metrics = [
                    results[task][metric]
                    for task in task_list
                    if metric in results[task]
                ]  # TODO: copy?
                stderrs = [
                    results[task][stderr]
                    for task in task_list
                    if stderr in results[task]
                ]
                sizes = [
                    results[task]["samples"]
                    for task in task_list
                    if metric in results[task]
                ]

                # compute group's pooled metric and stderr
                results[group][
                    metric
                ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
                # TODO: calculate grouped metric using aggregation fn
                if "N/A" in stderrs:
                    results[group][stderr] = "N/A"
                else:
                    results[group][
                        stderr
                    ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                    # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                    # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                    # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

                results[group]["samples"] = sum(sizes)

    results_agg = defaultdict(dict)
    groups_agg = defaultdict(dict)
    all_tasks_list = list(task_hierarchy.keys())
    while True:
        add_tasks_list = list(k for k in results_agg.keys())
        left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
        if len(left_tasks_list) == 0:
            break

        _task_hierarchy = {
            k: v for k, v in task_hierarchy.items() if k in left_tasks_list
        }
        _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

        results_agg = {**results_agg, **_results_agg}
        groups_agg = {**groups_agg, **_groups_agg}

    for group_name, task_list in task_hierarchy.items():
        if task_list:
            num_fewshot[group_name] = num_fewshot[
                task_list[0]
            ]  # TODO: validate this

    results_dict = {
        "results": dict(results_agg.items()),
        **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
        "group_subtasks": dict(reversed(task_hierarchy.items())),
        "configs": dict(sorted(configs.items())),
        "versions": dict(sorted(versions.items())),
        "n-shot": dict(sorted(num_fewshot.items())),
        "n-samples": {
            task_output.task_name: {
                "original": len(task_output.task.eval_docs),
                "effective": min(
                    limit if limit else len(task_output.task.eval_docs),
                    len(task_output.task.eval_docs),
                ),
            }
            for task_output in eval_tasks
        },
    }
    if log_samples:
        results_dict["samples"] = dict(samples)

    return results_dict

else:
    return None


def make_table(result_dict):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []

    for k, dic in result_dict["results"].items():
        version = result_dict["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"):
                continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
            else:
                values.append([k, version, m, "%.4f" % v, "", ""])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    return md_writer.dumps()


def request_caching_arg_to_dict(cache_requests: str) -> dict:
    request_caching_args = {
        "cache_requests": cache_requests in {"true", "refresh"},
        "rewrite_requests_cache": cache_requests == "refresh",
        "delete_requests_cache": cache_requests == "delete",
    }

    return request_caching_args
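# Illustrative usage sketch (added for clarity; not part of the original module).
# It exercises only the helper defined directly above.
def _example_request_caching_arg_to_dict():
    args = request_caching_arg_to_dict("refresh")
    # "refresh" reuses and rewrites the request cache, but does not delete it.
    assert args == {
        "cache_requests": True,
        "rewrite_requests_cache": True,
        "delete_requests_cache": False,
    }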
import collections
import math
import pathlib
import sys
from typing import Dict, List, Optional, Tuple, Union
from lm_eval.api import metrics
from lm_eval.utils import eval_logger, positional_deprecated
class TaskOutput:
"""
    Wrapper class for Task outputs. It contains various attributes and methods to manage and calculate metrics for the task.
Attributes:
task (object): The task object.
task_name (str): The name of the task.
task_config (dict): The configuration of the task.
version (str): The version of the task.
group_name (str): The name of the task group.
n_shot (int): The number of shots for the task.
task_alias (str): The alias of the task.
group_alias (str): The alias of the task group.
is_group (bool): Indicates if the task is a group.
logged_samples (list): The list of logged samples.
sample_len (int): The length of the samples.
sample_metrics (defaultdict): The dictionary of samples' metrics.
agg_metrics (defaultdict): The dictionary of aggregate metrics.
Methods:
from_taskdict(cls, task_name: str, task):
Creates a TaskOutput instance from a task dictionary.
calculate_aggregate_metric(bootstrap_iters=100000) -> None:
Calculates the aggregate metrics for the task.
"""
def __init__(
self,
task=None,
task_name=None,
task_config=None,
version=None,
group_name=None,
n_shot=None,
task_alias=None,
group_alias=None,
is_group=None,
):
self.task = task
self.task_config = task_config
self.task_name = task_name
self.group_name = group_name
self.version = version
self.n_shot = n_shot
self.task_alias = task_alias
self.group_alias = group_alias
self.is_group = is_group
self.logged_samples = []
self.sample_len = None
self.sample_metrics = collections.defaultdict(list)
self.agg_metrics = collections.defaultdict(list)
@classmethod
def from_taskdict(cls, task_name: str, task):
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
if not task:
            # these get filtered out in get_task_list
# once they are added to group hierarchy
is_group = True
return cls(
task=task, task_name=task_name, is_group=is_group, group_name=group_name
)
version = task.VERSION
task_config = dict(task.dump_config())
if (n_shot := task_config.get("num_fewshot")) == 0:
n_shot = task_config.get("metadata", {}).get("num_fewshot", 0)
task_alias = task_config.get("alias")
group_alias = task_config.get("group_alias")
return cls(
task=task,
task_name=task_name,
task_config=task_config,
group_name=group_name,
version=version,
n_shot=n_shot,
task_alias=task_alias,
group_alias=group_alias,
)
def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
for (metric, filter_key), items in self.sample_metrics.items():
agg_fn = self.task.aggregation()[metric]
metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
if bootstrap_iters:
stderr_fn = metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
def __repr__(self):
return (
f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, "
f"version={self.version},"
f"n_shot={self.n_shot}"
f"task_alias={self.task_alias}, group_alias={self.group_alias})"
)
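# Illustrative sketch (not part of the original module): how a TaskOutput accumulates
# per-sample metric values and then aggregates them. The `task` argument is assumed to
# expose an `aggregation()` mapping with an "acc" entry, as `calculate_aggregate_metric`
# expects; the values below are invented.
def _example_task_output_aggregation(task, task_name="demo_task"):
    out = TaskOutput(task=task, task_name=task_name, version=0, n_shot=0)
    # one (metric, filter_key) bucket, filled with per-document accuracy values
    out.sample_metrics[("acc", "none")].extend([1.0, 0.0, 1.0, 1.0])
    out.calculate_aggregate_metric(bootstrap_iters=0)  # 0 skips stderr bootstrapping
    return out.agg_metrics["acc,none"]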
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
task_hierarchy = collections.defaultdict(list)
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
for task_output in outputs:
if group_name := task_output.group_name:
task_hierarchy[group_name].append(task_output.task_name)
else:
task_hierarchy[task_output.task_name] = []
# returns task_hierarchy tracking which groups contain which subtasks,
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task]
def print_writeout(task) -> None:
for inst in task.instances:
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
if limit is not None:
limit = (
int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
)
return limit
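# Illustrative sketch (not part of the original module): `limit` may be a fraction of
# the eval set or an absolute count. The task stub below only provides `eval_docs`.
def _example_get_sample_size():
    class _TaskStub:
        eval_docs = list(range(200))

    assert get_sample_size(_TaskStub(), None) is None  # no limit
    assert get_sample_size(_TaskStub(), 0.1) == 20  # fraction of 200 docs
    assert get_sample_size(_TaskStub(), 50) == 50  # absolute cap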
def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0
) -> Tuple[dict, dict]:
"""
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task
hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
"""
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]:
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
else:
results_agg[group_name]["alias"] = tab_string + group_name
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy()
# groups_agg[group_name]["tab"] = tab
if "samples" in groups_agg[group_name]:
groups_agg[group_name].pop("samples")
if "alias" in groups_agg[group_name]:
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
)
else:
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list:
if task_name in task_hierarchy:
_task_hierarchy = {
**{task_name: task_hierarchy[task_name]},
**task_hierarchy,
}
else:
_task_hierarchy = {
**{task_name: []},
**task_hierarchy,
}
_results_agg, _groups_agg = prepare_print_tasks(
_task_hierarchy, results, tab + 1
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg
def consolidate_results(
eval_tasks: List[TaskOutput],
) -> Tuple[dict, dict, dict, dict, dict]:
"""
@param eval_tasks: list(TaskOutput).
@return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
Consolidates the results of multiple evaluation tasks into a single structure.
The method iterates over each evaluation instance and extracts relevant information to create the consolidated
results structure. The consolidated results structure has the following properties:
- results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
aliases specified in the task configuration.
- samples: A defaultdict with task names as keys and lists of log samples as values.
- configs: A defaultdict with task names as keys and task configurations as values.
- versions: A defaultdict with task names as keys and task versions as values.
- num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
"""
# stores the final result for each task, for each metric/filter pair.
results = collections.defaultdict(dict)
# logs info about each document evaluated.
samples = collections.defaultdict(list)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
    # Tracks the YAML configs of all chosen tasks.
configs = collections.defaultdict(dict)
# Tracks each task's version.
versions = collections.defaultdict(dict)
for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"]
if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias
num_fewshot[task_output.task_name] = task_output.n_shot
configs[task_output.task_name] = task_output.task_config
versions[task_output.task_name] = task_output.version
samples[task_output.task_name] = task_output.logged_samples
for (metric, filter_key), items in task_output.sample_metrics.items():
metric_key = f"{metric},{filter_key}"
results[task_output.task_name][metric_key] = task_output.agg_metrics[
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
return results, samples, configs, versions, num_fewshot
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
from functools import partial
from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
def build_filter_ensemble(
filter_name: str, components: List[List[str]]
) -> FilterEnsemble:
"""
Create a filtering pipeline.
"""
filters = []
for function, kwargs in components:
if kwargs is None:
kwargs = {}
# create a filter given its name in the registry
f = partial(get_filter(function), **kwargs)
# add the filter as a pipeline step
filters.append(f)
return FilterEnsemble(name=filter_name, filters=filters)
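# Illustrative sketch (not part of the original module): building a two-step ensemble.
# The filter names assume the "regex" and "take_first" registrations that appear later
# in this dump; the ensemble name and pattern are only example values.
def _example_build_filter_ensemble():
    return build_filter_ensemble(
        "strict-match",
        [
            ["regex", {"regex_pattern": r"#### (\-?[0-9\.\,]+)"}],
            ["take_first", None],
        ],
    )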
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
    A filter which splits documents into contaminated and non-contaminated subsets (not yet implemented).
"""
name = "track_decontamination"
def __init__(self, path) -> None:
"""
TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
should further cache result on a given (task_name, doc_id)
"""
self._decontam_results = None
def apply(self, resps, docs) -> None:
"""
Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
"""
pass
import re
import sys
import unicodedata
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
""" """
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
`fallback` defines the output returned if no matches for the regex are located.
"""
self.regex_pattern = regex_pattern
self.regex = re.compile(regex_pattern)
self.group_select = group_select
self.fallback = fallback
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
# independently (and keep them a list.)
def filter_set(inst):
filtered = []
for resp in inst:
match = self.regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
else:
match = self.fallback
filtered.append(match)
return filtered
# print(resps)
filtered_resps = list(map(lambda x: filter_set(x), resps))
# print(filtered_resps)
return filtered_resps
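# Illustrative sketch (not part of the original module): the default pattern pulls the
# final "#### <number>" answer out of each response; unmatched responses fall back to
# "[invalid]". The response strings are invented.
def _example_regex_filter():
    f = RegexFilter()
    resps = [["The answer is #### 42", "no final answer here"]]
    assert f.apply(resps, docs=None) == [["42", "[invalid]"]]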
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
resp = resp[1:]
filtered_resp.append(resp)
return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps]
return filtered_resps
@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter(RegexFilter):
"""
A filter used to extract a model's answer on multiple choice questions with
letter answers. assumes each document has a "choices" field
containing the list of answer choices and that the answer label symbols
are of the form (A), (B), (C), ... or A, B, C.
"""
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
ignore_case=False,
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
        regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure:
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
group_select: Selects the (group_select)th match from the findall result.
ignore_case: Ignores the case during step 1 matching
ignore_punctuation: Remove the punctuation during step 1 matching
regexes_to_ignore: Remove these regexes during step 1 matching
"""
super().__init__(regex_pattern, group_select, fallback)
self.ignore_case = ignore_case
self.ignore_punctuation = ignore_punctuation
self.regexes_to_ignore = regexes_to_ignore
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
# independently (and keep them a list.)
def find_match(regex, resp, convert_dict={}):
match = regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
if match and match in convert_dict:
match = convert_dict[match]
return match
punct_tbl = dict.fromkeys(
i
for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith("P")
)
def filter_ignores(st):
if self.regexes_to_ignore is not None:
for s in self.regexes_to_ignore:
st = re.sub(s, "", st)
if self.ignore_case:
st = st.lower()
if self.ignore_punctuation:
# https://stackoverflow.com/a/266162
st = st.translate(punct_tbl)
return st
filtered_resps = []
for r, doc in zip(resps, docs):
fallback_regexes = []
choice_to_alpha = {}
next_alpha = "A"
without_paren_fallback_regexes = []
without_paren_to_target = {}
choices = doc["choices"]
for c in choices:
m = filter_ignores(c.strip())
fallback_regexes.append(f"{re.escape(m)}")
choice_to_alpha[m] = f"({next_alpha})"
without_paren_fallback_regexes.append(next_alpha)
without_paren_to_target[next_alpha] = f"({next_alpha})"
next_alpha = chr(ord(next_alpha) + 1)
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
)
filtered = []
for resp in r:
match = find_match(self.regex, resp)
if not match:
match = find_match(
fallback_regex, filter_ignores(resp), choice_to_alpha
)
if not match:
match = find_match(
without_paren_fallback_regex, resp, without_paren_to_target
)
if not match:
match = self.fallback
filtered.append(match)
filtered_resps.append(filtered)
return filtered_resps
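# Illustrative sketch (not part of the original module): when the "#### ..." pattern
# fails, the filter falls back to matching the answer choices themselves and maps the
# matched choice back to its letter label. The doc and response values are invented.
def _example_multi_choice_regex_filter():
    f = MultiChoiceRegexFilter(ignore_case=True)
    docs = [{"choices": ["Paris", "London", "Rome"]}]
    resps = [["The capital of France is Paris."]]
    assert f.apply(resps, docs) == [["(A)"]]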
from collections import Counter
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
@register_filter("take_first")
class TakeFirstFilter(Filter):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps, docs):
"""
Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
"""
return map(lambda r: r[0], resps)
@register_filter("take_first_k")
class TakeKFilter(Filter):
def __init__(self, **kwargs) -> None:
self.k = kwargs.pop("k")
super().__init__(**kwargs)
def apply(self, resps, docs):
# need resp to be subscriptable to check below
resps = list(resps)
# check we have at least k responses per doc, else we can't take the first k
assert (
len(resps[0]) >= self.k
), f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
return map(lambda r: r[: self.k], resps)
@register_filter("majority_vote")
class MajorityVoteFilter(Filter):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps, docs):
"""
Each entry of `resps` is a list of model responses.
We select the response that occurs most frequently in each entry of `resps`.
"""
def select_majority(resp):
counts = Counter(resp)
vote = counts.most_common(1)[0][0]
return vote
return map(lambda r: [select_majority(r)], resps)
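# Illustrative sketch (not part of the original module): majority voting over repeated
# samples for one document (e.g. self-consistency style decoding). Values are invented.
def _example_majority_vote():
    f = MajorityVoteFilter()
    resps = [["42", "41", "42"]]
    assert list(f.apply(resps, docs=None)) == [["42"]]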
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("lowercase")
class LowercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.lower() for resp in inst]
return [filter_set(resp) for resp in resps]
@register_filter("uppercase")
class UppercaseFilter(Filter):
def __init__(self) -> None:
pass
def apply(self, resps, docs):
def filter_set(inst):
return [resp.upper() for resp in inst]
return [filter_set(resp) for resp in resps]
@register_filter("map")
class MapFilter(Filter):
def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
"""
Initializes the MapFilter with a given mapping dictionary and default value.
Args:
- mapping_dict (dict): A dictionary containing the key-value mappings.
Default is an empty dictionary.
- default_value (Any): The value to be returned when a key is not found in the mapping_dict.
Default is None.
Example:
mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
"""
if mapping_dict is None:
mapping_dict = {}
assert isinstance(
mapping_dict, dict
), "Provided mapping_dict is not a dictionary"
self.mapping_dict = mapping_dict
self.default_value = default_value
def apply(self, resps, docs):
def filter_set(inst):
return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
return [filter_set(resp) for resp in resps]
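# Illustrative sketch (not part of the original module): mapping letter answers onto
# label indices, with a default for anything unexpected.
def _example_map_filter():
    f = MapFilter({"A": 0, "B": 1}, default_value=-1)
    assert f.apply([["A", "B", "C"]], docs=None) == [[0, 1, -1]]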
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
        end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
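# Illustrative sketch (not part of the original module): how a model_args string is
# reduced to a model name and sanitized for use as a directory name. The argument
# string is an invented example.
def _example_general_config_tracker():
    tracker = GeneralConfigTracker()
    tracker.log_experiment_args(
        model_source="hf",
        model_args="pretrained=org/model-7b,dtype=float16",
    )
    assert tracker.model_name == "org/model-7b"
    assert tracker.model_name_sanitized == "org__model-7b"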
class EvaluationTracker:
"""
Keeps track and saves relevant information of the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
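# Illustrative sketch (not part of the original module): ",none" is the suffix added
# for the default filter, and is stripped from metric names before logging.
def _example_remove_none_pattern():
    assert remove_none_pattern("acc,none") == ("acc", True)
    assert remove_none_pattern("acc,strict-match") == ("acc,strict-match", False)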
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
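# Illustrative sketch (not part of the original module): typical use as the `default`
# hook of json.dumps, so numpy integers and sets do not raise TypeError.
def _example_handle_non_serializable():
    import json

    payload = {"doc_id": np.int64(3), "tags": {"qa", "dialog"}}
    return json.dumps(payload, default=_handle_non_serializable)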
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
import copy
import json
import logging
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
"""Returns a wandb printer instance for pretty stdout."""
from wandb.sdk.lib.printer import get_printer
from wandb.sdk.wandb_settings import Settings
printer = get_printer(Settings()._jupyter)
return printer
class WandbLogger:
def __init__(self, **kwargs) -> None:
"""Attaches to wandb logger if already initialized. Otherwise, passes kwargs to wandb.init()
Args:
kwargs Optional[Any]: Arguments for configuration.
Parse and log the results returned from evaluator.simple_evaluate() with:
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
wandb_logger.log_eval_samples(results["samples"])
"""
try:
import wandb
assert Version(wandb.__version__) >= Version("0.13.6")
if Version(wandb.__version__) < Version("0.13.6"):
wandb.require("report-editing:v0")
except Exception as e:
logger.warning(
"To use the wandb reporting functionality please install wandb>=0.13.6.\n"
"To install the latest version of wandb run `pip install wandb --upgrade`\n"
f"{e}"
)
self.wandb_args: Dict[str, Any] = kwargs
# initialize a W&B run
if wandb.run is None:
self.run = wandb.init(**self.wandb_args)
else:
self.run = wandb.run
self.printer = get_wandb_printer()
def post_init(self, results: Dict[str, Any]) -> None:
self.results: Dict[str, Any] = copy.deepcopy(results)
self.task_names: List[str] = list(results.get("results", {}).keys())
self.group_names: List[str] = list(results.get("groups", {}).keys())
def _get_config(self) -> Dict[str, Any]:
"""Get configuration parameters."""
self.task_configs = self.results.get("configs", {})
cli_configs = self.results.get("config", {})
configs = {
"task_configs": self.task_configs,
"cli_configs": cli_configs,
}
return configs
def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
"""Sanitize the results dictionary."""
_results = copy.deepcopy(self.results.get("results", dict()))
# Remove None from the metric string name
tmp_results = copy.deepcopy(_results)
for task_name in self.task_names:
task_result = tmp_results.get(task_name, dict())
for metric_name, metric_value in task_result.items():
_metric_name, removed = remove_none_pattern(metric_name)
if removed:
_results[task_name][_metric_name] = metric_value
_results[task_name].pop(metric_name)
# remove string valued keys from the results dict
wandb_summary = {}
for task in self.task_names:
task_result = _results.get(task, dict())
for metric_name, metric_value in task_result.items():
if isinstance(metric_value, str):
wandb_summary[f"{task}/{metric_name}"] = metric_value
for summary_metric, summary_value in wandb_summary.items():
_task, _summary_metric = summary_metric.split("/")
_results[_task].pop(_summary_metric)
tmp_results = copy.deepcopy(_results)
for task_name, task_results in tmp_results.items():
for metric_name, metric_value in task_results.items():
_results[f"{task_name}/{metric_name}"] = metric_value
_results[task_name].pop(metric_name)
for task in self.task_names:
_results.pop(task)
return wandb_summary, _results
def _log_results_as_table(self) -> None:
"""Generate and log evaluation results as a table to W&B."""
columns = [
"Version",
"Filter",
"num_fewshot",
"Metric",
"Value",
"Stderr",
]
def make_table(columns: List[str], key: str = "results"):
import wandb
table = wandb.Table(columns=columns)
results = copy.deepcopy(self.results)
for k, dic in results.get(key).items():
if k in self.group_names and not key == "groups":
continue
version = results.get("versions").get(k)
if version == "N/A":
version = None
n = results.get("n-shot").get(k)
for (mf), v in dic.items():
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
if m == "alias":
continue
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
if se != "N/A":
se = "%.4f" % se
table.add_data(*[k, version, f, n, m, str(v), str(se)])
else:
table.add_data(*[k, version, f, n, m, str(v), ""])
return table
# log the complete eval result to W&B Table
table = make_table(["Tasks"] + columns, "results")
self.run.log({"evaluation/eval_results": table})
if "groups" in self.results.keys():
table = make_table(["Groups"] + columns, "groups")
self.run.log({"evaluation/group_eval_results": table})
def _log_results_as_artifact(self) -> None:
"""Log results as JSON artifact to W&B."""
import wandb
dumped = json.dumps(
self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
)
artifact = wandb.Artifact("results", type="eval_results")
with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
f.write(dumped)
self.run.log_artifact(artifact)
def log_eval_result(self) -> None:
"""Log evaluation results to W&B."""
# Log configs to wandb
configs = self._get_config()
self.run.config.update(configs)
wandb_summary, self.wandb_results = self._sanitize_results_dict()
# update wandb.run.summary with items that were removed
self.run.summary.update(wandb_summary)
# Log the evaluation metrics to wandb
self.run.log(self.wandb_results)
# Log the evaluation metrics as W&B Table
self._log_results_as_table()
# Log the results dict as json to W&B Artifacts
self._log_results_as_artifact()
def _generate_dataset(
self, data: List[Dict[str, Any]], config: Dict[str, Any]
) -> pd.DataFrame:
"""Generate a dataset from evaluation data.
Args:
data (List[Dict[str, Any]]): The data to generate a dataset for.
config (Dict[str, Any]): The configuration of the task.
Returns:
pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
"""
ids = [x["doc_id"] for x in data]
labels = [x["target"] for x in data]
instance = [""] * len(ids)
resps = [""] * len(ids)
filtered_resps = [""] * len(ids)
model_outputs = {}
metrics_list = config["metric_list"]
metrics = {}
for metric in metrics_list:
metric = metric.get("metric")
if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
if metric in ["byte_perplexity", "bits_per_byte"]:
metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
else:
metrics[f"{metric}_words"] = [x[metric][1] for x in data]
else:
metrics[metric] = [x[metric] for x in data]
if config["output_type"] == "loglikelihood":
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
resps = [
f'log probability of continuation is {x["resps"][0][0][0]} '
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["resps"][0][0][1] else "be"
)
for x in data
]
filtered_resps = [
f'log probability of continuation is {x["filtered_resps"][0][0]} '
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["filtered_resps"][0][1] else "be"
)
for x in data
]
elif config["output_type"] == "multiple_choice":
instance = [x["arguments"][0][0] for x in data]
choices = [
"\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
for x in data
]
resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
filtered_resps = [
np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
]
elif config["output_type"] == "loglikelihood_rolling":
instance = [x["arguments"][0][0] for x in data]
resps = [x["resps"][0][0] for x in data]
filtered_resps = [x["filtered_resps"][0] for x in data]
elif config["output_type"] == "generate_until":
instance = [x["arguments"][0][0] for x in data]
resps = [x["resps"][0][0] for x in data]
filtered_resps = [x["filtered_resps"][0] for x in data]
model_outputs["raw_predictions"] = resps
model_outputs["filtered_predictions"] = filtered_resps
df_data = {
"id": ids,
"data": instance,
}
if config["output_type"] == "multiple_choice":
df_data["choices"] = choices
tmp_data = {
"input_len": [len(x) for x in instance],
"labels": labels,
"output_type": config["output_type"],
}
df_data.update(tmp_data)
df_data.update(model_outputs)
df_data.update(metrics)
return pd.DataFrame(df_data)
def _log_samples_as_artifact(
self, data: List[Dict[str, Any]], task_name: str
) -> None:
import wandb
# log the samples as an artifact
dumped = json.dumps(
data,
indent=2,
default=_handle_non_serializable,
ensure_ascii=False,
)
artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
with artifact.new_file(
f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
) as f:
f.write(dumped)
self.run.log_artifact(artifact)
# artifact.wait()
def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
"""Log evaluation samples to W&B.
Args:
samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
"""
task_names: List[str] = [
x for x in self.task_names if x not in self.group_names
]
ungrouped_tasks = []
tasks_by_groups = {}
for task_name in task_names:
group_names = self.task_configs[task_name].get("group", None)
if group_names:
if isinstance(group_names, str):
group_names = [group_names]
for group_name in group_names:
if not tasks_by_groups.get(group_name):
tasks_by_groups[group_name] = [task_name]
else:
tasks_by_groups[group_name].append(task_name)
else:
ungrouped_tasks.append(task_name)
for task_name in ungrouped_tasks:
eval_preds = samples[task_name]
# log the samples as a W&B Table
df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
self.run.log({f"{task_name}_eval_results": df})
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
for group, grouped_tasks in tasks_by_groups.items():
grouped_df = pd.DataFrame()
for task_name in grouped_tasks:
eval_preds = samples[task_name]
df = self._generate_dataset(
eval_preds, self.task_configs.get(task_name)
)
df["group"] = group
df["task"] = task_name
grouped_df = pd.concat([grouped_df, df], ignore_index=True)
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})