Merge branch 'big-refactor' into wmt

3b4fa26e · Lintang Sutawika · GitHub · d01cc479 · 8f448eed · 3b4fa26e
Unverified Commit 3b4fa26e authored Sep 05, 2023 by Lintang Sutawika Committed by GitHub Sep 05, 2023
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -43,3 +43,9 @@ repos:
              .*\.json|ignore.txt
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.5.1
+    hooks:
+    - id: mypy
+      additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
+      exclude: ^tests/.*$
--- a/README.md
+++ b/README.md
@@ -116,8 +116,10 @@ accelerate launch main.py \
 This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
-However, if your model *is too large to be run on a single one of your GPUs*, then we provide an alternative method to run these large models: use of the `parallelize` argument.
+If your model *is too large to be run on a single one of your GPUs*, then you can use `accelerate` with Fully Sharded Data Parallel (FSDP), which splits the weights of the model across your data parallel ranks. To enable this, ensure you select `YES` when asked ```Do you want to use FullyShardedDataParallel?``` when running `accelerate config`. To enable memory-efficient loading, select `YES` when asked `Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start?`. This will ensure only the rank 0 process loads the model and then broadcasts the parameters to the other ranks, instead of having each rank load all parameters, which can lead to large RAM usage spikes around the start of the script that may cause errors.
+We also provide a second method to run these large models: use of the `parallelize` argument.
 ```
 python main.py \
    --model hf \
@@ -132,7 +134,7 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f
 - `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
 - `offload_folder`: a folder where model weights will be offloaded to disk if needed.
-Using this setting helps for massive models like BLOOM which require, or to avoid exceeding your total system RAM (by default, with `accelerate launch` one copy of the model for each GPU is initialized in RAM before moving it to GPU, resulting in large RAM usage spikes around the start of the script that may cause errors such as `Killed`.) However, it naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
+Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
 **Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**

--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from typing import List
 from lm_eval.api.instance import Instance
+from datasets import Dataset
 class Filter:
@@ -13,12 +14,12 @@ class Filter:
    """
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
        """
-    def apply(self, resps):
+    def apply(self, resps, docs):
        """
        Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
        Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
@@ -40,14 +41,14 @@ class FilterEnsemble:
    name: str
    filters: List[Filter]
-    def apply(self, instances: List[Instance]):
+    def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
        resps = [
            inst.resps for inst in instances
        ]  # operate just on the model responses
        for f in self.filters:
            # apply filters in sequence
-            resps = f.apply(resps)
+            resps = f.apply(resps, docs)
        # add the end results after filtering to filtered_requests of their respective source instances.
        # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.

--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
@@ -19,7 +19,7 @@ class Instance:
    doc_id: str = None
    repeats: str = None
-    def __post_init__(self):
+    def __post_init__(self) -> None:
        # unpack metadata field
        self.task_name, self.doc_id, self.repeats = self.metadata

--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -332,7 +332,7 @@ def _sacreformat(refs, preds):
 class _bootstrap_internal:
-    def __init__(self, f, n):
+    def __init__(self, f, n) -> None:
        self.f = f
        self.n = n

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -13,7 +13,7 @@ from lm_eval.logger import eval_logger
 class LM(abc.ABC):
-    def __init__(self):
+    def __init__(self) -> None:
        """Defines the interface that should be implemented by all LM subclasses.
        LMs are assumed to take text (strings) as input and yield strings as output
        (inputs/outputs should be tokenization-agnostic.)
@@ -133,7 +133,7 @@ class LM(abc.ABC):
        # not support multi-device parallelism nor expect it.
        return self._world_size
-    def set_cache_hook(self, cache_hook):
+    def set_cache_hook(self, cache_hook) -> None:
        self.cache_hook = cache_hook
@@ -144,14 +144,14 @@ def hash_args(attr, args):
 class CacheHook:
-    def __init__(self, cachinglm):
+    def __init__(self, cachinglm) -> None:
        if cachinglm is None:
            self.dbdict = None
            return
        self.dbdict = cachinglm.dbdict
-    def add_partial(self, attr, req, res):
+    def add_partial(self, attr, req, res) -> None:
        if self.dbdict is None:
            return
        hsh = hash_args(attr, req)
@@ -159,7 +159,7 @@ class CacheHook:
 class CachingLM:
-    def __init__(self, lm, cache_db):
+    def __init__(self, lm, cache_db) -> None:
        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
        :param lm: LM

--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
 class Sampler:
-    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
+    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        self.rnd = rnd
        assert self.rnd, "must pass rnd to FewShotSampler!"
@@ -19,7 +18,6 @@ class Sampler:
            self.docs = self.docs.select(fewshot_indices)
    def get_context(self, doc, num_fewshot):
        # draw an extra fewshot sample if using same split as evaluating on
        n_samples = (
            num_fewshot + 1
@@ -74,7 +72,7 @@ class Sampler:
 class BalancedSampler(Sampler):
-    def sample(self, n):
+    def sample(self, n) -> None:
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in? maybe random?
@@ -84,7 +82,7 @@ class BalancedSampler(Sampler):
 class ManualSampler(Sampler):
-    def sample(self, n):
+    def sample(self, n) -> None:
        """ """
        pass

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -88,7 +88,13 @@ class TaskConfig(dict):
    metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
-    def __post_init__(self):
+    def __post_init__(self) -> None:
+        if "." in self.dataset_path:
+            import inspect
+            from importlib import import_module
+            self.dataset_path = inspect.getfile(import_module(self.dataset_path))
        if self.generation_kwargs is not None:
            if self.output_type != "greedy_until":
@@ -171,7 +177,7 @@ class Task(abc.ABC):
        cache_dir=None,
        download_mode=None,
        config=None,
-    ):
+    ) -> None:
        """
        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
@@ -182,7 +188,6 @@ class Task(abc.ABC):
            HuggingFace `datasets` API with the default cache directory located at:
                `~/.cache/huggingface/datasets`
            NOTE: You can change the cache location globally for a given process
-            by setting the shell environment variable, `HF_DATASETS_CACHE`,
            to another directory:
                `export HF_DATASETS_CACHE="/path/to/another/directory"`
        :param download_mode: datasets.DownloadMode
@@ -213,7 +218,7 @@ class Task(abc.ABC):
            list(self.fewshot_docs()), self, rnd=random.Random(1234)
        )
-    def download(self, data_dir=None, cache_dir=None, download_mode=None):
+    def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
        """Downloads and returns the task dataset.
        Override this method to download the dataset from a custom API.
@@ -322,7 +327,7 @@ class Task(abc.ABC):
        return rnd.sample(self._training_docs, k)
-    def doc_to_decontamination_query(self, doc):
+    def doc_to_decontamination_query(self, doc) -> None:
        print(
            "Override doc_to_decontamination_query with document specific decontamination query."
        )
@@ -336,7 +341,7 @@ class Task(abc.ABC):
    def doc_to_target(self, doc):
        pass
-    def build_all_requests(self, limit=None, rank=None, world_size=None):
+    def build_all_requests(self, limit=None, rank=None, world_size=None) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
        if self.has_test_docs():
            docs = self.test_docs()
@@ -472,7 +477,6 @@ class Task(abc.ABC):
                return labeled_examples + str(example)
    def apply_filters(self):
        if hasattr(self, "_filters"):
            for f in self._filters:
                f.apply(self._instances)
@@ -498,7 +502,7 @@ class ConfigurableTask(Task):
    def __init__(
        self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
-    ):  # TODO no super() call here
+    ) -> None:  # TODO no super() call here
        # Get pre-configured attributes
        self._config = self.CONFIG
@@ -570,7 +574,6 @@ class ConfigurableTask(Task):
                            "aggregation"
                        ]
                else:
                    INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
                    metric_agg = get_default_aggregation(metric_name)
                    eval_logger.warning(
@@ -627,19 +630,19 @@ class ConfigurableTask(Task):
            )
        if self.has_test_docs():
-            docs = self.test_docs()
+            self.task_docs = self.test_docs()
        elif self.has_validation_docs():
-            docs = self.validation_docs()
+            self.task_docs = self.validation_docs()
        else:
            assert (
                False
            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
        # Test One Doc
-        self.features = list(docs.features.keys())
+        self.features = list(self.task_docs.features.keys())
        self.multiple_input = 0
        self.multiple_target = 0
-        test_doc = docs[0]
+        test_doc = self.task_docs[0]
        test_text = self.doc_to_text(test_doc)
        test_target = self.doc_to_target(test_doc)
@@ -683,8 +686,7 @@ class ConfigurableTask(Task):
                    f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
                )
-    def download(self, dataset_kwargs=None):
+    def download(self, dataset_kwargs=None) -> None:
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
            name=self.DATASET_NAME,
@@ -743,6 +745,15 @@ class ConfigurableTask(Task):
                )
            return super().fewshot_docs()
+    def apply_filters(self):
+        if hasattr(self, "_filters"):
+            for f in self._filters:
+                f.apply(self._instances, self.task_docs)
+        else:
+            eval_logger.warning("No filter defined, passing through instances")
+            return self._instances
    def should_decontaminate(self):
        return self._config.should_decontaminate
@@ -767,7 +778,6 @@ class ConfigurableTask(Task):
        return doc
    def doc_to_text(self, doc):
        if self.prompt is not None:
            doc_to_text = self.prompt
        else:
@@ -783,7 +793,7 @@ class ConfigurableTask(Task):
                return doc[doc_to_text]
            else:
                text_string = utils.apply_template(doc_to_text, doc)
-                if text_string.isdigit():
+                if text_string.isdigit() and self._config.doc_to_choice is not None:
                    return ast.literal_eval(text_string)
                else:
                    return text_string
@@ -802,7 +812,6 @@ class ConfigurableTask(Task):
            raise TypeError
    def doc_to_target(self, doc: dict) -> Union[int, str, list]:
        if self.prompt is not None:
            doc_to_target = self.prompt
        else:
@@ -818,7 +827,7 @@ class ConfigurableTask(Task):
                return doc[doc_to_target]
            else:
                target_string = utils.apply_template(doc_to_target, doc)
-                if target_string.isdigit():
+                if target_string.isdigit() and self._config.doc_to_choice is not None:
                    return ast.literal_eval(target_string)
                elif (
                    len(target_string) >= 2
@@ -844,7 +853,6 @@ class ConfigurableTask(Task):
            raise TypeError
    def doc_to_choice(self, doc: Any) -> List[str]:
        if self.prompt is not None:
            doc_to_choice = self.prompt
        elif self._config.doc_to_choice is None:
@@ -888,13 +896,11 @@ class ConfigurableTask(Task):
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
        if self.OUTPUT_TYPE == "loglikelihood":
            arguments = (ctx, self.doc_to_target(doc))
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
            arguments = (self.doc_to_target(doc),)
        elif self.OUTPUT_TYPE == "multiple_choice":
            choices = self.doc_to_choice(doc)
            target_delimiter = self._config.target_delimiter
            if self.multiple_input:
@@ -945,7 +951,6 @@ class ConfigurableTask(Task):
        )
    def process_results(self, doc, results):
        if callable(self._config.process_results):
            return self._config.process_results(doc, results)
@@ -980,7 +985,6 @@ class ConfigurableTask(Task):
                ),
            }
        elif self.OUTPUT_TYPE == "multiple_choice":
            lls, is_greedy = zip(*results)
            # retrieve choices in List[str] form, to compute choice lengths, etc.
@@ -1005,18 +1009,36 @@ class ConfigurableTask(Task):
                gold = self.doc_to_text(doc)
            else:
                gold = self.doc_to_target(doc)
-                if type(gold) is str:
-                    gold = choices.index(gold)
+            gold_index_error = False
+            if type(gold) is list:
+                gold = [i if i < len(choices) else -100 for i in gold]
+                if -100 in gold:
+                    gold_index_error = True
+            else:
+                if type(gold) is int:
+                    gold = gold if gold < len(choices) else -100
+                elif type(gold) is str:
+                    gold = choices.index(gold) if gold in choices else -100
+                if gold == -100:
+                    gold_index_error = True
+            if gold_index_error:
+                eval_logger.warning(
+                    f"Label index was not in within range of available choices,"
+                    f"Sample:\n\n{doc}\n\n"
+                )
            if self.multiple_target:
                acc = 1.0 if pred in gold else 0.0
                acc_norm = 1.0 if pred_norm in gold else 0.0
-                exact_match = int(any([is_greedy[i] for i in gold]))
+                exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
            else:
                acc = 1.0 if pred == gold else 0.0
                acc_norm = 1.0 if pred_norm == gold else 0.0
                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
-                exact_match = int(is_greedy[gold])
+                exact_match = int(is_greedy[gold]) if gold != -100 else 0
            result_dict = {
                **({"acc": acc} if "acc" in use_metric else {}),
@@ -1034,7 +1056,6 @@ class ConfigurableTask(Task):
                result_dict["acc_mutual_info"] = acc_mutual_info
        elif self.OUTPUT_TYPE == "greedy_until":
            gold = self.doc_to_target(doc)
            if self._config.doc_to_choice is not None:
                # If you set doc_to_choice,
@@ -1172,7 +1193,7 @@ class PerplexityTask(Task):
    def doc_to_decontamination_query(self, doc):
        return doc
-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc) -> str:
        return ""
    def doc_to_target(self, doc):

--- a/lm_eval/benchmarks/__init__.py
+++ b/lm_eval/benchmarks/__init__.py
@@ -11,8 +11,7 @@ from lm_eval.api.registry import (
 )
-def include_benchmarks(task_dir):
+def include_benchmarks(task_dir: str) -> None:
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:

--- a/lm_eval/decontamination/archiver.py
+++ b/lm_eval/decontamination/archiver.py
 import os
+from typing import Any
 import zstandard
 import json
 import jsonlines
@@ -9,7 +10,7 @@ import tqdm
 from pathlib import Path
-def json_serial(obj):
+def json_serial(obj: Any) -> str:
    """JSON serializer for objects not serializable by default json code"""
    if isinstance(obj, (datetime.datetime,)):
@@ -19,7 +20,7 @@ def json_serial(obj):
 # Modified version of lm_dataformat Archive for single file.
 class Archive:
-    def __init__(self, file_path, compression_level=3):
+    def __init__(self, file_path: str, compression_level: int = 3) -> None:
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
@@ -28,7 +29,7 @@ class Archive:
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)
-    def add_data(self, data, meta={}):
+    def add_data(self, data, meta={}) -> None:
        self.compressor.write(
            json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
                "UTF-8"
@@ -36,7 +37,7 @@ class Archive:
            + b"\n"
        )
-    def commit(self):
+    def commit(self) -> None:
        self.compressor.flush(zstandard.FLUSH_FRAME)
        self.fh.flush()
        self.fh.close()
@@ -44,10 +45,16 @@ class Archive:
 # Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
 class Reader:
-    def __init__(self):
+    def __init__(self) -> None:
        pass
-    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
+    def read(
+        self,
+        file,
+        get_meta: bool = False,
+        autojoin_paragraphs: bool = True,
+        para_joiner: str = "\n\n",
+    ):
        with open(file, "rb") as fh:
            self.fh = fh
            cctx = zstandard.ZstdDecompressor()
@@ -72,7 +79,7 @@ class Reader:
 class TextArchive:
-    def __init__(self, file_path, mode="rb+"):
+    def __init__(self, file_path, mode: str = "rb+") -> None:
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
@@ -83,21 +90,21 @@ class TextArchive:
        self.fh = open(self.file_path, mode)
-    def add_data(self, data):
+    def add_data(self, data) -> None:
        self.fh.write(data.encode("UTF-8") + b"\n")
-    def commit(self):
+    def commit(self) -> None:
        self.fh.flush()
        self.fh.close()
 class TextReader:
-    def __init__(self, file_path):
+    def __init__(self, file_path) -> None:
        self.file_path = file_path
    # Optimized mmap read with infrequent tqdm updates to maintain speed
    # Tested up to 250MB/s.
-    def read_tqdm(self, update_frequency=10000):
+    def read_tqdm(self, update_frequency: int = 10000):
        current_file_position = 0
        line_counter = 0
        with open(self.file_path, "r") as fh, tqdm.tqdm(
@@ -149,7 +156,7 @@ class TextReader:
 # Optimized for speed. Decompresses the archive in shell before
 # using the mmap'd TextReader.
 class ZStdTextReader:
-    def __init__(self, file):
+    def __init__(self, file) -> None:
        self.file = file
    def read_tqdm(self):

--- a/lm_eval/decontamination/decontaminate.py
+++ b/lm_eval/decontamination/decontaminate.py
@@ -11,7 +11,7 @@ from .archiver import ZStdTextReader
 # Was used for testing the evaluator decoupled from the full logic below
-def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
+def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
    simulated_overlap = 0.1
    contaminated = int(len(docs) * simulated_overlap)
    return random.sample(range(len(docs)), contaminated)
@@ -25,6 +25,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
 # scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
 # files. These should exist in the "ngrams_path" provided to this function.
 # Algorithm:
 # 1. Build lookups for each dataset {ngram: list(document_ids)}
 # 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
@@ -33,7 +34,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
 # 4. Strip the task_set from the dictionary keys and return
 #
 # We cache the task+set lookups as well as the overlaps.
-def get_train_overlap(docs_by_task_set, ngrams_path, limit):
+def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
    # return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
    info_dict_path = os.path.join(ngrams_path, "info.json")
@@ -46,7 +47,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
    print("Building Lookups...")
    start = time.perf_counter()
-    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
+    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
    lookups = {}

--- a/lm_eval/decontamination/janitor.py
+++ b/lm_eval/decontamination/janitor.py
 import re
 import string
-import timeit
 import pickle
 import traceback
 from pprint import pprint
+from typing import Iterator, Sequence, TypeVar
 # This is a cpp module. Compile janitor_util.cpp with:
 # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
@@ -16,10 +16,12 @@ except Exception:
    traceback.print_exc()
    JANITOR_CPP = False
+T = TypeVar("T")
 # Implementation from nltk source
 # https://www.nltk.org/_modules/nltk/util.html
-def form_ngrams(sequence, n):
+def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[tuple[T, ...]]:
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
@@ -36,7 +38,7 @@ def form_ngrams(sequence, n):
        del history[0]
-def word_ngrams(s, n):
+def word_ngrams(s: str, n: int) -> Iterator[str]:
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
@@ -68,14 +70,14 @@ def word_ngrams(s, n):
 # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
-def split_indices(s):
+def split_indices(s: str) -> Iterator[tuple[str, tuple[int, int]]]:
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
-def word_ngrams_indices(s, n):
+def word_ngrams_indices(s: str, n: int) -> Iterator[tuple[str, tuple[int, int]]]:
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)
@@ -104,16 +106,15 @@ def word_ngrams_indices(s, n):
 class Janitor:
    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
        self,
-        ngram_n=13,
+        ngram_n: int = 13,
-        window_to_remove=200,
+        window_to_remove: int = 200,
-        too_dirty_cutoff=10,
+        too_dirty_cutoff: int = 10,
-        minimum_slice_length=200,
+        minimum_slice_length: int = 200,
-        delete_chars=string.punctuation,
+        delete_chars: str = string.punctuation,
-    ):
+    ) -> None:
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
@@ -135,11 +136,11 @@ class Janitor:
    # I/O for saving contamination ngrams
    ##############
-    def save_contamination_ngrams(self, filename):
+    def save_contamination_ngrams(self, filename: str) -> None:
        with open(filename, "wb") as fp:
            pickle.dump(filename, fp)
-    def load_contamination_ngrams(self, filename):
+    def load_contamination_ngrams(self, filename: str) -> None:
        with open(filename, "rb") as fp:
            self.dirt_ngrams = pickle.load(fp)
@@ -147,7 +148,7 @@ class Janitor:
    # Call these :)
    ##############
-    def register_contaminant(self, dirt_string):
+    def register_contaminant(self, dirt_string: str) -> None:
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
@@ -156,7 +157,7 @@ class Janitor:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)
-    def clean(self, dirty_string):
+    def clean(self, dirty_string: str) -> list[str]:
        """Clean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
@@ -166,7 +167,9 @@ class Janitor:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)
-    def _split_chunks(self, dirty_string, dirty_parts):
+    def _split_chunks(
+        self, dirty_string: str, dirty_parts: Sequence[tuple]
+    ) -> list[str]:
        clean_chunks = []
        splice_idx = 0
        end = -1
@@ -189,12 +192,12 @@ class Janitor:
    # Fast C++
    ##############
-    def register_contaminant_cpp(self, dirt_string):
+    def register_contaminant_cpp(self, dirt_string) -> None:
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )
-    def clean_cpp(self, dirty_string):
+    def clean_cpp(self, dirty_string: str) -> list[str]:
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
@@ -204,15 +207,15 @@ class Janitor:
    # Slow python
    ##############
-    def normalize_string(self, s):
+    def normalize_string(self, s: str) -> str:
        return s.translate(self.translation_table)
-    def register_contaminant_python(self, dirt_string):
+    def register_contaminant_python(self, dirt_string: str) -> None:
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )
-    def clean_python(self, dirty_string):
+    def clean_python(self, dirty_string: str) -> list[str]:
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -42,11 +42,11 @@ def simple_evaluate(
    device=None,
    use_cache=None,
    limit=None,
-    bootstrap_iters=100000,
+    bootstrap_iters: int = 100000,
-    check_integrity=False,
+    check_integrity: bool = False,
    decontamination_ngrams_path=None,
-    write_out=False,
+    write_out: bool = False,
-    log_samples=True,
+    log_samples: bool = True,
 ):
    """Instantiate and evaluate a model on a list of tasks.
@@ -117,7 +117,6 @@ def simple_evaluate(
    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj
@@ -175,10 +174,10 @@ def evaluate(
    lm,
    task_dict,
    limit=None,
-    bootstrap_iters=100000,
+    bootstrap_iters: int = 100000,
    decontamination_ngrams_path=None,
-    write_out=False,
+    write_out: bool = False,
-    log_samples=True,
+    log_samples: bool = True,
 ):
    """Instantiate and evaluate a model on a list of tasks.
@@ -223,7 +222,6 @@ def evaluate(
    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            task_groups[task_name] = group
@@ -350,7 +348,6 @@ def evaluate(
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)
@@ -359,7 +356,6 @@ def evaluate(
        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
@@ -17,14 +17,16 @@ FILTER_REGISTRY = {
 def get_filter(filter_name):
-    return FILTER_REGISTRY[filter_name]
+    if filter_name in FILTER_REGISTRY:
+        return FILTER_REGISTRY[filter_name]
+    else:
+        return filter_name
 def build_filter_ensemble(filter_name, components):
    """
    Create a filtering pipeline.
    """
    filters = []
    for (function, kwargs) in components:
        if kwargs is None:

--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
@@ -9,7 +9,7 @@ class DecontaminationFilter(Filter):
    name = "track_decontamination"
-    def __init__(self, path):
+    def __init__(self, path) -> None:
        """
        TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
@@ -17,7 +17,7 @@ class DecontaminationFilter(Filter):
        """
        self._decontam_results = None
-    def apply(self, reps):
+    def apply(self, resps, docs) -> None:
        """
        Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
        """

--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -6,7 +6,9 @@ from lm_eval.api.filter import Filter
 class RegexFilter(Filter):
    """ """
-    def __init__(self, regex_pattern=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]"):
+    def __init__(
+        self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]"
+    ) -> None:
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
        `fallback` defines the output returned if no matches for the regex are located.
@@ -15,7 +17,7 @@ class RegexFilter(Filter):
        self.regex = re.compile(regex_pattern)
        self.fallback = fallback
-    def apply(self, resps):
+    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
@@ -41,12 +43,11 @@ class RegexFilter(Filter):
 class WhitespaceFilter(Filter):
    """ """
-    def __init__(self):
+    def __init__(self) -> None:
        pass
-    def apply(self, resps):
+    def apply(self, resps, docs):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
                if resp.startswith(" "):

--- a/lm_eval/filters/selection.py
+++ b/lm_eval/filters/selection.py
@@ -4,12 +4,12 @@ from lm_eval.api.filter import Filter
 class TakeFirstFilter(Filter):
-    def __init__(self):
+    def __init__(self) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
        """
-    def apply(self, resps):
+    def apply(self, resps, docs):
        """
        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
        """
@@ -17,13 +17,12 @@ class TakeFirstFilter(Filter):
 class TakeKFilter(Filter):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
        self.k = kwargs.pop("k")
        super().__init__(*args, **kwargs)
-    def apply(self, resps):
+    def apply(self, resps, docs):
        # check we have at least k responses per doc, else we can't take the first k
        assert (
            len(resps[0]) >= self.k
@@ -32,12 +31,12 @@ class TakeKFilter(Filter):
 class MajorityVoteFilter(Filter):
-    def __init__(self):
+    def __init__(self) -> None:
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
        """
-    def apply(self, resps):
+    def apply(self, resps, docs):
        """
        Each entry of `resps` is a list of model responses.
        We select the response that occurs most frequently in each entry of `resps`.

--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -76,7 +76,7 @@ class AnthropicLM(LM):
        max_tokens_to_sample: int = 256,
        temperature: float = 0,  # defaults to 1
        **kwargs,  # top_p, top_k, etc.
-    ):
+    ) -> None:
        """Anthropic API wrapper.
        :param model: str
@@ -135,11 +135,10 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
+    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
    def greedy_until(self, requests) -> List[str]:
        if not requests:
            return []

--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
@@ -5,7 +5,7 @@ from lm_eval.api.registry import register_model
 @register_model("dummy")
 class DummyLM(LM):
-    def __init__(self):
+    def __init__(self) -> None:
        super().__init__()
    @classmethod

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
+import os
 import torch
 import transformers
 from transformers.models.auto.modeling_auto import (
@@ -20,7 +22,7 @@ from lm_eval.api.registry import register_model
 from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
-from accelerate import Accelerator, find_executable_batch_size
+from accelerate import Accelerator, find_executable_batch_size, DistributedType
 from typing import List, Optional, Union
@@ -67,6 +69,7 @@ class HFLM(LM):
        revision: Optional[str] = "main",
        subfolder: Optional[str] = None,
        tokenizer: Optional[str] = None,
+        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
@@ -75,6 +78,7 @@ class HFLM(LM):
        low_cpu_mem_usage: Optional[bool] = True,
        trust_remote_code: Optional[bool] = False,
        use_fast_tokenizer: Optional[bool] = True,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        parallelize: Optional[bool] = False,
@@ -90,7 +94,7 @@ class HFLM(LM):
        bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
        gptq: Optional[Union[bool, str]] = False,
        gptq_use_triton: Optional[bool] = False,
-    ):
+    ) -> None:
        super().__init__()
        assert isinstance(device, str)
@@ -240,6 +244,8 @@ class HFLM(LM):
            use_fast=use_fast_tokenizer,
        )
+        self.truncation = truncation
        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
@@ -289,9 +295,16 @@ class HFLM(LM):
                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                    )
            else:
-                self._model = accelerator.prepare_model(
+                assert accelerator.distributed_type in [
-                    self.model, evaluation_mode=True
+                    DistributedType.FSDP, 
-                )
+                    DistributedType.MULTI_GPU
+                ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+                if accelerator.distributed_type == DistributedType.FSDP:
+                    self._model = accelerator.prepare(self.model)
+                else:
+                    self._model = accelerator.prepare_model(
+                        self.model, evaluation_mode=True 
+                    )
                self._device = torch.device(f"cuda:{accelerator.local_process_index}")
                self.accelerator = accelerator
@@ -334,7 +347,7 @@ class HFLM(LM):
        return self._DEFAULT_MAX_LENGTH
    @property
-    def max_gen_toks(self):
+    def max_gen_toks(self) -> int:
        return 256
    @property
@@ -353,7 +366,7 @@ class HFLM(LM):
    def world_size(self):
        return self._world_size
-    def _detect_batch_size(self, requests=None, pos=0):
+    def _detect_batch_size(self, requests=None, pos: int = 0):
        if requests:
            _, context_enc, continuation_enc = requests[pos]
            max_length = len(
@@ -419,7 +432,11 @@ class HFLM(LM):
        return encoding
    def tok_batch_encode(
-        self, strings: List[str], padding_side="left", left_truncate_len=None
+        self,
+        strings: List[str],
+        padding_side: str = "left",
+        left_truncate_len: Optional[int] = None,
+        truncation: bool = False,
    ):
        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
        old_padding_side = self.tokenizer.padding_side
@@ -432,6 +449,7 @@ class HFLM(LM):
        encoding = self.tokenizer(
            strings,
+            truncation=truncation,
            padding="longest",
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
@@ -595,7 +613,9 @@ class HFLM(LM):
        return loglikelihoods
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False, override_bs=None):
+    def _loglikelihood_tokens(
+        self, requests, disable_tqdm: bool = False, override_bs=None
+    ):
        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
        res = []
@@ -856,7 +876,9 @@ class HFLM(LM):
                # encode, pad, and truncate contexts for this batch
                context_enc, attn_masks = self.tok_batch_encode(
-                    contexts, left_truncate_len=max_ctx_len
+                    contexts,
+                    left_truncate_len=max_ctx_len,
+                    truncation=self.truncation,
                )
                context_enc = context_enc.to(self.device)
                attn_masks = attn_masks.to(self.device)