Unverified commit 3b4fa26e, authored by Lintang Sutawika and committed by GitHub

Merge branch 'big-refactor' into wmt

parents d01cc479 8f448eed
......@@ -6,9 +6,9 @@ dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_choice: ['choice1', 'choice2']
metric_list:
  - metric: exact_match
    aggregation: mean
......
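As a sanity check on the two changed lines above: `doc_to_text` now includes the premise, and `doc_to_choice` switches the gold strings from True/False to the literal `choice1`/`choice2` answers used by the T5 SuperGLUE prompts. Below is a small sketch with plain `jinja2` and an invented COPA-style document; the harness wires its own Jinja environment, so this is illustration only.

```python
from jinja2 import Template

doc = {
    "premise": "The man broke his toe.",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
    "question": "cause",
    "label": 1,
}
doc_to_text = Template(
    "copa choice1: {{choice1}} choice2: {{choice2}} "
    "premise: {{premise}} question: {{question}}"
)
choices = ["choice1", "choice2"]

print(doc_to_text.render(**doc))
# copa choice1: He got a hole in his sock. choice2: He dropped a hammer on
# his foot. premise: The man broke his toe. question: cause
print(choices[doc["label"]])  # choice2  (the exact_match target string)
```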
group:
- super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.5
metric_list:
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.agg_f1
    higher_is_better: true
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.agg_em
    higher_is_better: true
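The `doc_to_choice` template above folds the MultiRC question index into both candidate target strings, which is what lets the per-question aggregation in `t5_utils.py` (next file) regroup predictions later. A quick illustration with plain `jinja2` and an invented `idx` value; the harness uses its own Jinja environment.

```python
from jinja2 import Template

doc_to_choice = Template(
    "{% set group_id = idx.question|string %}"
    "{{[group_id+'_False', group_id+'_True']}}"
)
doc = {"idx": {"paragraph": 0, "question": 3, "answer": 17}}
print(doc_to_choice.render(**doc))  # ['3_False', '3_True']
```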
import collections

import numpy as np
import sklearn.metrics


def f1(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _reference = references[0].split("_")[-1]
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (prediction, reference)


def agg_f1(items):
    predictions, references = zip(*items)
    references, predictions = np.asarray(references), np.asarray(predictions)
    return sklearn.metrics.f1_score(references, predictions)


def em(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _group, _reference = references[0].split("_")
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (_group, prediction, reference)


def agg_em(items):
    grouped_values = collections.defaultdict(lambda: ([], []))
    for group, prediction, reference in items:
        grouped_values[group][0].append(reference)
        grouped_values[group][1].append(prediction)

    group_scores = []
    for group, (targets, predictions) in grouped_values.items():
        score = float(np.array_equal(targets, predictions))
        group_scores.append(score)
    return np.mean(group_scores)
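To see how the pieces above fit together (invented values; assumes `em`/`agg_em` from this file are in scope): the model's raw generation is a bare `True`/`False`, the rendered target string carries the question index as a `<qid>_` prefix, and `agg_em` only credits a question once every one of its answer options is predicted correctly.

```python
items = [
    em(predictions=["True"], references=["0_True"]),    # ("0", 1, 1) -> correct
    em(predictions=["False"], references=["0_True"]),   # ("0", 0, 1) -> wrong
    em(predictions=["False"], references=["1_False"]),  # ("1", 0, 0) -> correct
]
print(agg_em(items))  # 0.5: question "1" is fully correct, question "0" is not
```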
......@@ -3,14 +3,15 @@ group:
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
metric_list:
  - metric: exact_match
    aggregation: mean
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.squad_em_agg
    higher_is_better: true
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.squad_f1_agg
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
import re
import string
import collections

import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets

from lm_eval.api.metrics import metric_max_over_ground_truths


def doc_to_text(doc):
    passage = doc["passage"]
    passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
    passage = re.sub(r"\n@highlight\n", ". ", passage)

    return " ".join(
        [
            "record query:",
            doc["query"],
            "entities:",
            ", ".join(doc["entities"]),
            "passage:",
            passage,
        ]
    )


def process_docs(dataset):
    def split_answers(doc):
        split_doc = {
            **{k: [] for k in doc.keys()},
        }
        answers = doc.pop("answers")
        for idx, answer in enumerate(answers):
            for key in split_doc.keys():
                if key in doc:
                    split_doc[key].append(doc[key])
            split_doc["answers"].append(answer)
        return split_doc

    dataset = dataset.map(split_answers)
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]

    return Dataset.from_dict(new_dataset)


def normalize_squad(answer):
    """Normalization used in official SQuAD evaluation script."""

    def _normalize_answer(text, punc_chars, punc_repl):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(s):
            return re.sub(r"\b(a|an|the)\b", " ", s)

        def replace_punctuation(s):
            to_replace = set(punc_chars)
            return "".join(punc_repl if ch in to_replace else ch for ch in s)

        def white_space_fix(s):
            return " ".join(s.split())

        text = text.lower()
        text = replace_punctuation(text)
        text = remove_articles(text)
        text = white_space_fix(text)
        return text

    return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")


def em(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def f1(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def squad_em_agg(items):
    def _exact_match_score(prediction, target):
        return target == prediction

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        # if group not in grouped_values:
        grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    em = []
    for group in grouped_values.keys():
        predictions, targets = grouped_values[group]
        for p in predictions:
            em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
    return np.mean(em)


def squad_f1_agg(items):
    def _f1_score(prediction, target):
        """Computes token f1 score for a single target and prediction."""
        prediction_tokens = prediction.split()
        target_tokens = target.split()
        common = collections.Counter(prediction_tokens) & collections.Counter(
            target_tokens
        )
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1.0 * num_same / len(prediction_tokens)
        recall = 1.0 * num_same / len(target_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        if group not in grouped_values:
            grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    f1 = []
    for group in grouped_values.keys():
        p, t = grouped_values[group]
        f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
    return np.mean(f1)
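A toy walk-through of the ReCoRD aggregation above (invented values; assumes the functions in this file are in scope): `process_docs` has already split each multi-answer document into one row per gold answer, the `doc_to_target` template tags every row with its `passage+query` index, and the aggregators regroup rows by that tag and take the best score over all gold answers.

```python
items = [
    # (prediction, reference) pairs, as returned by the em/f1 passthroughs above
    ("Barack Obama", "12+3_Barack Obama"),
    ("Barack Obama", "12+3_Obama"),
    ("Paris", "40+7_London"),
]
print(squad_em_agg(items))  # ~0.67: both rows of query 12+3 match a gold answer
print(squad_f1_agg(items))  # 0.5: query 12+3 scores 1.0, query 40+7 scores 0.0
```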
group:
- super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
group:
- super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
......@@ -2,7 +2,7 @@ group:
- super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
output_type: multiple_choice
training_split: train
validation_split: validation
......
import re

from lm_eval.utils import general_detokenize


def t5_prompt_doc_to_text(x):
    def _mark_span(text, span_str, span_idx, mark):
        pattern_tmpl = r"^((?:\S+\s){N})(W)"
        pattern = re.sub("N", str(span_idx), pattern_tmpl)
        pattern = re.sub("W", span_str, pattern)
        return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)

    text = x["text"]
    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
    # Compensate for the two marker "words" added in the previous step.
    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
    text = _mark_span(text, x["span2_text"], span2_index, "#")
    return text


def default_doc_to_text(x):
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.
......
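The removed `t5_prompt_doc_to_text` above wraps the two WSC spans in `*`/`#` markers by rewriting the word found at each span index. Below is a hedged sketch of what it produces, using an invented document that follows the SuperGLUE `wsc` field names and assuming the function above is in scope.

```python
doc = {
    "text": "Mark told Pete many lies about himself .",
    "span1_text": "Mark",
    "span1_index": 0,
    "span2_text": "himself",
    "span2_index": 6,
}
print(t5_prompt_doc_to_text(doc))
# * Mark * told Pete many lies about # himself # .
```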
......@@ -2,16 +2,17 @@ group:
- super-glue-t5-prompt
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
doc_to_text: !function "t5_utils.doc_to_text"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
  - metric: accuracy
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "wsc_postprocessor"
    filter:
      - function: !function t5_utils.WSCPostprocess
import re

from lm_eval.api.filter import Filter


def doc_to_text(x):
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text


def _wsc_inputs(x):
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is the
    # first or last word in the text. None of the examples in WSC seem to have
    # this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        assert words[pronoun_index] == x["span2_text"]
        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()


class WSCPostprocess(Filter):
    def __init__(self, **kwargs):
        self.determiners = {
            "a",
            "an",
            "few",
            "her",
            "his",
            "each",
            "every",
            "many",
            "much",
            "my",
            "our",
            "some",
            "that",
            "the",
            "their",
            "these",
            "this",
            "those",
            "which",
            "whose",
            "your",
        }

    def clean(self, s):
        """Ignore capitalization and determiners."""
        s = s.strip().lower()
        return " ".join([w for w in s.split(" ") if w not in self.determiners])

    def apply(self, resps, docs):
        filtered_resps = []
        for prediction, reference in zip(*(resps, docs["span1_text"])):
            prediction = self.clean(prediction[0])
            reference = self.clean(reference)
            if ("'" in prediction) != ("'" in reference):
                # Don't mark cases where the prediction is "Bob" and the
                # referent is "Bob's hat" as predicting the referent.
                predicted_referent = False
            else:
                prediction_words = set(prediction.split(" "))
                referent_words = set(reference.split(" "))

                # Handle cases where the prediction is "fuzzy bunny" and the referent is
                # "bunny".
                predicted_referent = prediction_words.issubset(
                    referent_words
                ) or referent_words.issubset(prediction_words)

            filtered_resps.append(predicted_referent)

        return filtered_resps
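A toy run of the filter above (invented generations and referents): the filter lower-cases, drops determiners, and then treats a prediction as naming the referent when one token set contains the other, except when only one side is possessive.

```python
post = WSCPostprocess()
resps = [["the bunny"], ["Bob's hat"]]             # one list of generations per doc
docs = {"span1_text": ["The fuzzy bunny", "Bob"]}  # gold referents, column-style access
print(post.apply(resps, docs))  # [True, False]
```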
# WSC273
### Paper
Title: `The Winograd Schema Challenge`
Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and contain an ambiguity that is resolved in opposite ways in the two
sentences; resolving it requires world knowledge and reasoning.
The Winograd Schema Challenge 273 (WSC273) is a collection of 273 such Winograd schemas.
NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.0
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
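To make `partial evaluation` concrete: instead of asking the model to name the referent, each candidate is substituted for the pronoun and the model is scored on the likelihood of the text that follows it; the `doc_to_choice`/`doc_to_target` templates in the task yaml further down perform exactly this split. The sketch below does the same split in plain Python on an invented example document.

```python
doc = {
    "text": "The trophy doesn't fit into the brown suitcase because it is too small.",
    "pronoun": "it",
    "options": ["the trophy", "the suitcase"],
}
doc["pronoun_loc"] = doc["text"].index(" it ") + 1  # computed, not hard-coded

prefix = doc["text"][: doc["pronoun_loc"]]
continuation = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]) :]

# Each choice substitutes a candidate referent for the pronoun; the model then
# scores the shared continuation after each choice, and the candidate giving it
# the higher log-likelihood wins.
choices = [prefix + option for option in doc["options"]]
print(choices[1])    # The trophy doesn't fit into the brown suitcase because the suitcase
print(continuation)  # " is too small."
```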
### Citation
```
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
```
### Groups and Tasks
#### Groups
* Not part of any group yet.
#### Tasks
* `wsc273`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"

    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())

    return option
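A hedged usage sketch of the preprocessing above on a one-row toy dataset, assuming `process_doc` from this file is in scope (field names follow `winograd_wsc/wsc273`, values are invented): the double space in `text` is collapsed and each option is normalized before the Jinja templates in the yaml slice the text around `pronoun_loc`.

```python
from datasets import Dataset

ds = Dataset.from_dict(
    {
        "text": ["Jane gave Joan candy because  she was hungry."],  # note the double space
        "pronoun": ["she"],
        "pronoun_loc": [30],
        "options": [["Jane", "Joan"]],
        "label": [1],
    }
)
ds = process_doc(ds)
print(ds[0]["text"])     # Jane gave Joan candy because she was hungry.
print(ds[0]["options"])  # ['Jane', 'Joan'] (no possessive or casing change needed here)
```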
......@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
from typing import List, Literal, Union
from typing import Iterator, List, Literal, Union
import gc
import torch
......@@ -65,7 +65,7 @@ def join_iters(iters):
        yield from iter


def chunks(iter, n=0, fn=None):
def chunks(iter, n: int = 0, fn=None):
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
......@@ -87,11 +87,11 @@ def group(arr, fn):
class MultiChoice:
    def __init__(self, choices):
    def __init__(self, choices) -> None:
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
    def __contains__(self, values) -> bool:
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                eval_logger.info(f"Available tasks to choose:")
......@@ -100,7 +100,7 @@ class MultiChoice:
                raise ValueError("'{}' is not in task list".format(value))
        return True

    def __iter__(self):
    def __iter__(self) -> Iterator:
        for choice in self.choices:
            yield choice
......@@ -108,7 +108,6 @@ class MultiChoice:
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
    if type(patterns) == str:
        patterns = [patterns]
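The hunk above cuts `pattern_match` off mid-body, so here is a self-contained sketch of the same wildcard idea used for task selection; the completion of the function body is an assumption, not the harness's exact code.

```python
import fnmatch

def match_tasks(patterns, all_tasks):
    # Expand fnmatch-style (linux filename) wildcard patterns into task names.
    if isinstance(patterns, str):
        patterns = [patterns]
    selected = set()
    for pattern in patterns:
        selected.update(fnmatch.filter(all_tasks, pattern))
    return sorted(selected)

all_tasks = ["arc_easy", "arc_challenge", "hellaswag", "super_glue-rte-t5-prompt"]
print(match_tasks("arc_*", all_tasks))                   # ['arc_challenge', 'arc_easy']
print(match_tasks("super_glue-*-t5-prompt", all_tasks))  # ['super_glue-rte-t5-prompt']
```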
......@@ -177,7 +176,7 @@ def make_disjoint_window(pair):
class Reorderer:
    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        self.size = len(arr)
        arr = list(enumerate(arr))
        arr = group(arr, lambda x: fn(x[1]))
......@@ -212,7 +211,7 @@ class Grouper:
    objects in `arr` satisfying `key == fn(ob)`.
    """

    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        # self.orig_arr = arr
        self.size = len(arr)
        arr = list(enumerate(arr))
......@@ -263,7 +262,7 @@ class Grouper:
        return res


def make_table(result_dict, column="results"):
def make_table(result_dict, column: str = "results"):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter
......@@ -393,7 +392,6 @@ def get_git_commit_hash():
def import_function(loader, node):
    function_name = loader.construct_scalar(node)
    yaml_path = os.path.dirname(loader.name)
......@@ -428,7 +426,6 @@ def load_yaml_config(yaml_path):
            include_path.reverse()
            final_yaml_config = {}
            for path in include_path:
                # Assumes that path is a full path.
                # If not found, assume the included yaml
                # is in the same dir as the original yaml
......@@ -447,7 +444,7 @@ def load_yaml_config(yaml_path):
    return yaml_config


def regex_replace(string, pattern, repl, count=0):
def regex_replace(string, pattern, repl, count: int = 0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)
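For readers unfamiliar with custom Jinja filters: once `regex_replace` is registered on the template environment, task yaml templates can call it like any built-in filter. The wiring below is an illustrative sketch, not the harness's exact setup.

```python
import re

from jinja2 import Environment

def regex_replace(string, pattern, repl, count: int = 0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)

env = Environment()
env.filters["regex_replace"] = regex_replace

template = env.from_string("{{ text | regex_replace('[0-9]+', '#') }}")
print(template.render(text="call 555 then 911"))  # call # then #
```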
......@@ -521,7 +518,7 @@ def pad_and_concat(
    return torch.cat(tensors, dim=0)


def clear_torch_cache():
def clear_torch_cache() -> None:
    gc.collect()
    torch.cuda.empty_cache()
......@@ -546,7 +543,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
    ) -> None:
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
......
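`MultiTokenEOSCriteria` is only partially visible in the hunk above. As background, the general pattern it follows is a `transformers.StoppingCriteria` that decodes only the newly generated tokens and stops once every sequence in the batch contains the stop string; the class below is a simplified sketch under that assumption, not the harness's implementation.

```python
import transformers

class StopOnSequence(transformers.StoppingCriteria):
    """Stop generation once `stop_string` appears in every batch element."""

    def __init__(self, stop_string, tokenizer, initial_decoder_input_length, batch_size):
        self.stop_string = stop_string
        self.tokenizer = tokenizer
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done = [False] * batch_size

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Decode only the continuation, i.e. the tokens generated after the prompt.
        continuations = self.tokenizer.batch_decode(
            input_ids[:, self.initial_decoder_input_length :]
        )
        self.done = [
            done or self.stop_string in text
            for done, text in zip(self.done, continuations)
        ]
        return all(self.done)

# Typical use:
#   criteria = transformers.StoppingCriteriaList(
#       [StopOnSequence("</s>", tokenizer, prompt_len, batch_size)]
#   )
#   model.generate(input_ids, stopping_criteria=criteria, ...)
```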
......@@ -11,11 +11,12 @@ from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder
from lm_eval.benchmarks import include_benchmarks

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def parse_args():
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
......@@ -100,7 +101,7 @@ def parse_args():
    return parser.parse_args()


def main():
def main() -> None:
    args = parse_args()

    if args.limit:
......
[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
......@@ -53,7 +53,7 @@ setuptools.setup(
    ],
    python_requires=">=3.9",
    install_requires=[
        "accelerate>=0.18.0",
        "accelerate>=0.21.0",
        "evaluate",
        "datasets>=2.0.0",
        "evaluate>=0.4.0",
......