This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders used in earlier versions of the `lm-evaluation-harness`, before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default download manager. For example, some datasets in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing) in ways that the `lm-evaluation-harness` did not.
__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to that script's local path, as in the sketch below.
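For illustration, a minimal sketch of that override, assuming the pre-refactor `lm_eval.base.Task` interface; the script path and class body here are hypothetical, and a real task would also implement the usual `Task` methods:

```python
# A minimal sketch, not a drop-in task: `my_dataset.py` is a hypothetical
# local loading script; only the DATASET_PATH override is the point.
from lm_eval.base import Task


class MyCustomTask(Task):
    VERSION = 0
    # Point the task at a local loading script instead of a Hub dataset id.
    DATASET_PATH = "path/to/my_dataset.py"
    DATASET_NAME = None
```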
__WARNING__: A handful of loading scripts are included in this collection only because they have not yet been pushed to the HuggingFace Hub or a HuggingFace organization repo. We will remove these scripts once they have been pushed.
{"asdiv":{"description":"ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n","citation":"@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n","homepage":"https://github.com/chaochun/nlu-asdiv-dataset","license":"","features":{"body":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"solution_type":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"},"formula":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"as_div","config_name":"asdiv","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":501489,"num_examples":2305,"dataset_name":"as_div"}},"download_checksums":{"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip":{"num_bytes":440966,"checksum":"8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}},"download_size":440966,"post_processing_size":null,"dataset_size":501489,"size_in_bytes":942455}}
"""Instantiate and evaluate a model on a list of tasks.
"""Instantiate and evaluate a model on a list of tasks.
...
@@ -117,10 +116,11 @@ def simple_evaluate(
...
@@ -117,10 +116,11 @@ def simple_evaluate(
task_dict=lm_eval.tasks.get_task_dict(tasks)
task_dict=lm_eval.tasks.get_task_dict(tasks)
fortask_nameintask_dict.keys():
fortask_nameintask_dict.keys():
task_obj=task_dict[task_name]
task_obj=task_dict[task_name]
iftype(task_obj)==tuple:
iftype(task_obj)==tuple:
group,task_obj=task_obj
group,task_obj=task_obj
iftask_objisNone:
continue
config=task_obj._config
config=task_obj._config
ifnum_fewshotisnotNone:
ifnum_fewshotisnotNone:
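For context, `get_task_dict` can map a name either to a bare task object or, when the task was selected through a group, to a `(group, task)` tuple; as the hunks here imply, group placeholder entries can carry `None` in the task slot, which is what the added guard skips. A self-contained sketch, with hypothetical task names and objects:

```python
# Self-contained sketch (hypothetical names/objects) of the two shapes
# the loop above unpacks.
class DummyTask:
    _config = {"num_fewshot": 0}


task_dict = {
    "arc_easy": DummyTask(),                # a bare task
    "math_algebra": ("math", DummyTask()),  # a task selected via the "math" group
    "math": ("math", None),                 # a group placeholder: nothing to run
}

for task_name in task_dict.keys():
    task_obj = task_dict[task_name]
    if type(task_obj) == tuple:
        group, task_obj = task_obj
        if task_obj is None:  # skip pure group entries
            continue
    config = task_obj._config
    print(task_name, config)
```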
```diff
...
@@ -175,17 +175,17 @@ def evaluate(
     lm,
     task_dict,
     limit=None,
-    bootstrap_iters=100000,
+    bootstrap_iters: int = 100000,
     decontamination_ngrams_path=None,
-    write_out=False,
-    log_samples=True,
+    write_out: bool = False,
+    log_samples: bool = True,
 ):
     """Instantiate and evaluate a model on a list of tasks.

     :param lm: obj
         Language Model
     :param task_dict: dict[str, Task]
-        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
+        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
     :param limit: int, optional
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
```
```diff
...
@@ -210,24 +210,30 @@ def evaluate(
     samples = collections.defaultdict(list)
     # tracks all Instances/requests a model must generate output on.
     requests = collections.defaultdict(list)
-    # Stores task scores based on task grouping.
-    aggregate = collections.defaultdict(dict)
-    # tracks if a task was chosen via user selecting a group containing it
-    task_groups = collections.defaultdict(dict)
+    # Aggregated task scores presented with groups
+    results_agg = collections.defaultdict(dict)
+    # Aggregated groups scores only
+    groups_agg = collections.defaultdict(dict)
     # stores the amount to pad out reqs per req. type so that
     # number of fwd passes per distributed rank is equal
     padding_requests = collections.defaultdict(int)
+    # store the hierarchy to do proper ordering
+    task_hierarchy = collections.defaultdict(list)
+    # Stores group related keys and values for group-aggregation
+    task_groups = collections.defaultdict(dict)
+    # store the ordering of tasks and groups
+    task_order = collections.defaultdict(int)
+    # store the aggregation for aggregating across tasks in the same group
+    sample_agg_fn = collections.defaultdict(dict)

     # get lists of each type of request
     for task_name, task in task_dict.items():
         if type(task) == tuple:
-            group, task = task
-            task_groups[task_name] = group
-            aggregate[task_name] = {}
+            group_name, task = task
+            task_hierarchy[group_name].append(task_name)
+        else:
+            task_hierarchy[task_name] = []
+
+        if task is None:
+            continue

         versions[task_name] = task.VERSION
         configs[task_name] = dict(task.dump_config())
```
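To make the new bookkeeping concrete, a small sketch (hypothetical task names, plain strings standing in for task objects) of what `task_hierarchy` holds after this loop: each group name maps to the list of its member tasks, while standalone tasks map to an empty list, which is what later drives ordered, group-aware aggregation.

```python
import collections

# Sketch (hypothetical names): the structure the loop above builds.
task_hierarchy = collections.defaultdict(list)

task_dict = {
    "math_algebra": ("math", "task-obj-1"),
    "math_geometry": ("math", "task-obj-2"),
    "arc_easy": "task-obj-3",
}

for task_name, task in task_dict.items():
    if type(task) == tuple:
        group_name, task = task
        task_hierarchy[group_name].append(task_name)
    else:
        task_hierarchy[task_name] = []

print(dict(task_hierarchy))
# -> {'math': ['math_algebra', 'math_geometry'], 'arc_easy': []}
```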
```diff
...
@@ -252,7 +258,8 @@ def evaluate(
             # print the prompt for the first few documents
             if inst.doc_id < 1:
                 eval_logger.info(
-                    f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
+                    f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
+\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
                 )
                 eval_logger.info(f"Request: {str(inst)}")
...
@@ -302,6 +309,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         task.apply_filters()

     ### Collect values of metrics on all datapoints ###
...
@@ -311,6 +320,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         # TODO: make it possible to use a different metric per filter
...
                     "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                 )
             else:
-                self._model = accelerator.prepare_model(
-                    self.model, evaluation_mode=True
-                )
+                assert accelerator.distributed_type in [
+                    DistributedType.FSDP,
+                    DistributedType.MULTI_GPU,
+                ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
```