"""
Semantic Parsing on Freebase from Question-Answer Pairs
https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are supposed to be answerable by Freebase, a
large knowledge graph. The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
"""
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
"""
class WebQs(Task):
VERSION = 0
DATASET_PATH = "web_questions"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return "Question: " + doc["question"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
return " " + doc["answers"][0]
def _remove_prefixes(self, aliases):
        # Optimization: remove any alias that has a strict prefix elsewhere in
        # the list. If the longer alias would be scored greedy (`is_greedy`),
        # its retained prefix would be too, so no acceptable answer is lost.
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
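    # Illustrative example (not from the original file): with
    # aliases = ["New York City", "NYC", "New York"], sorting yields
    # ["NYC", "New York", "New York City"]; "New York City" is then dropped
    # because "New York" is a strict prefix of it, leaving ["NYC", "New York"].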
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc["answers"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {"acc": float(any(results))}
def aggregation(self):
return {
"acc": mean,
}
def higher_is_better(self):
return {"acc": True}
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.base import PerplexityTask
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
# contractions
string = string.replace("s '", "s'")
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
# number separators
string = string.replace(" @-@ ", "-")
string = string.replace(" @,@ ", ",")
string = string.replace(" @.@ ", ".")
# punctuation
string = string.replace(" : ", ": ")
string = string.replace(" ; ", "; ")
string = string.replace(" . ", ". ")
string = string.replace(" ! ", "! ")
string = string.replace(" ? ", "? ")
string = string.replace(" , ", ", ")
# double brackets
string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
# miscellaneous
string = string.replace("= = = =", "====")
string = string.replace("= = =", "===")
string = string.replace("= =", "==")
string = string.replace(" " + chr(176) + " ", chr(176))
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" N ", " 1 ")
string = string.replace(" 's", "'s")
return string
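# Illustrative round-trip (assumed input, not from the original file):
#   wikitext_detokenizer("the club 's record was 9 @.@ 5 , however")
#   -> "the club's record was 9.5, however"
# The " @.@ " / " @-@ " / " @,@ " sentinels mark number-internal punctuation
# in raw WikiText.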
class WikiText(PerplexityTask):
VERSION = 1
DATASET_PATH = "EleutherAI/wikitext_document_level"
DATASET_NAME = "wikitext-2-raw-v1"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return map(self._process_doc, self.dataset["train"])
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
def should_decontaminate(self):
return True
def count_words(self, doc):
# count number of words in *original doc before detokenization*
return len(re.split(r"\s+", doc))
"""
WinoGrande: An Adversarial Winograd Schema Challenge at Scale
https://arxiv.org/pdf/1907.10641.pdf
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
"""
class Winogrande(Task):
VERSION = 0
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
answer_to_num = {"1": 0, "2": 1}
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
return self.partial_context(doc, doc["option" + doc["answer"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["sentence"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the sentence with the specified option
# and ignore everything after.
pronoun_loc = doc["sentence"].index("_")
return doc["sentence"][:pronoun_loc] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
pronoun_loc = doc["sentence"].index("_") + 1
return " " + doc["sentence"][pronoun_loc:].strip()
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
lls = []
for option in [doc["option1"], doc["option2"]]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
        ctx = ctx.split("\n\n")  # Fewshot examples are separated by blank lines.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
"""
The Winograd Schema Challenge
http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
"""
class WinogradSchemaChallenge273(Task):
VERSION = 0
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
upper_pronouns = [
"A",
"An",
"The",
"She",
"He",
"It",
"They",
"My",
"His",
"Her",
"Their",
]
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
return doc
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
if not start_of_sentence and pronoun in self.upper_pronouns:
return option.replace(pronoun, pronoun.lower())
return option
def fewshot_examples(self, k, rnd):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
if self._fewshot_docs is None:
self._fewshot_docs = list(self.test_docs())
return rnd.sample(list(self._fewshot_docs), k)
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["text"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the original text with the specified
# option and ignore everything after.
return doc["text"][: doc["pronoun_loc"]] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
start_index = doc["pronoun_loc"] + len(doc["pronoun"])
return " " + doc["text"][start_index:].strip()
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
lls = []
for option in doc["options"]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
        ctx = ctx.split("\n\n")  # Fewshot examples are separated by blank lines.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
return {"acc": np.argmax(results) == doc["label"]}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
"""
XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
Homepage: https://github.com/cambridgeltl/xcopa
"""
from .superglue import Copa
_CITATION = """
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2020},
url={https://ducdauge.github.io/files/xcopa.pdf}
}
"""
class XCopa(Copa):
VERSION = 0
DATASET_PATH = "xcopa"
DATASET_NAME = None
CAUSE = "because"
EFFECT = "therefore"
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": self.CAUSE,
"effect": self.EFFECT,
}[doc["question"]]
return doc["premise"].strip()[:-1] + f" {connector}"
class XCopaEt(XCopa):
DATASET_NAME = "et"
CAUSE = "sest"
EFFECT = "seetõttu"
class XCopaHt(XCopa):
DATASET_NAME = "ht"
CAUSE = "poukisa"
EFFECT = "donk sa"
class XCopaIt(XCopa):
DATASET_NAME = "it"
CAUSE = "perché"
EFFECT = "quindi"
class XCopaId(XCopa):
DATASET_NAME = "id"
CAUSE = "karena"
EFFECT = "maka"
class XCopaQu(XCopa):
DATASET_NAME = "qu"
CAUSE = "imataq"
EFFECT = "chaymi"
class XCopaSw(XCopa):
DATASET_NAME = "sw"
CAUSE = "kwa sababu"
EFFECT = "kwa hiyo"
class XCopaZh(XCopa):
DATASET_NAME = "zh"
CAUSE = "因为"
EFFECT = "所以"
class XCopaTa(XCopa):
DATASET_NAME = "ta"
CAUSE = "காரணமாக"
EFFECT = "எனவே"
class XCopaTh(XCopa):
DATASET_NAME = "th"
CAUSE = "เพราะ"
EFFECT = "ดังนั้น"
class XCopaTr(XCopa):
DATASET_NAME = "tr"
CAUSE = "çünkü"
EFFECT = "bu yüzden"
class XCopaVi(XCopa):
DATASET_NAME = "vi"
CAUSE = "bởi vì"
EFFECT = "vì vậy"
LANGS = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]
LANG_CLASSES = [
XCopaEt,
XCopaHt,
XCopaIt,
XCopaId,
XCopaQu,
XCopaSw,
XCopaZh,
XCopaTa,
XCopaTh,
XCopaTr,
XCopaVi,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xcopa_{lang}"] = lang_class
return tasks
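# Illustrative registry usage (not from the original file):
#   construct_tasks() -> {"xcopa_et": XCopaEt, ..., "xcopa_vi": XCopaVi}
# so callers can merge the mapping into the harness's task registry.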
"""
XNLI: Evaluating Cross-lingual Sentence Representations
https://arxiv.org/abs/1809.05053
Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
Prompt format (same as XGLM and mGPT):
sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
The prediction is the full sequence with the highest likelihood.
Language-specific prompts are translated word-by-word with Google Translate
and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
Homepage: https://github.com/facebookresearch/XNLI
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
from lm_eval import utils
_CITATION = """
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
class XNLIBase(Task):
VERSION = 0
DATASET_PATH = "xnli"
DATASET_NAME = None
QUESTION_WORD = None # 'right'
ENTAILMENT_LABEL = None # 'Yes'
NEUTRAL_LABEL = None # 'Also'
CONTRADICTION_LABEL = None # 'No'
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Example:
# The girl that can help me is all the way across town, right? Yes, The girl I need help from lives a ways away.
# [MASK] is replaced with ENTAILMENT_LABEL, NEUTRAL_LABEL, or CONTRADICTION_LABEL
return (
doc["premise"]
+ ", "
+ self.QUESTION_WORD
+ "? [MASK], "
+ doc["hypothesis"]
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return (
" "
+ [self.ENTAILMENT_LABEL, self.NEUTRAL_LABEL, self.CONTRADICTION_LABEL][
doc["label"]
]
)
def doc_to_fewshot_prompt(self, doc):
prompt = self.doc_to_text(doc)
return prompt.replace("[MASK]", self.doc_to_target(doc)[1:])
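    # Illustrative example (assumed doc, not from the original file): with
    #   premise = "The girl can help me", hypothesis = "The girl lives nearby",
    #   label = 1 (neutral), and the English labels defined below,
    # doc_to_text -> "The girl can help me, right? [MASK], The girl lives nearby"
    # doc_to_fewshot_prompt -> "The girl can help me, right? Also, The girl lives nearby"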
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.ENTAILMENT_LABEL))
ll_neither = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.NEUTRAL_LABEL))
ll_false = rf.loglikelihood_rolling(
ctx.replace("[MASK]", self.CONTRADICTION_LABEL)
)
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {"acc": pred == gold}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
@utils.positional_deprecated
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
:param doc: str
The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param rnd: random.Random
The pseudo-random number generator used to randomly sample examples.
WARNING: This is currently a required arg although it's optionalized with a default `None`.
:param description: str
The task's description that will be prepended to the fewshot examples.
:returns: str
The fewshot context.
"""
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
description = description + "\n\n" if description else ""
if num_fewshot == 0:
labeled_examples = ""
else:
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs()
if self.has_validation_docs()
else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = (
"\n\n".join(
[
# self.doc_to_text(doc) + self.doc_to_target(doc)
self.doc_to_fewshot_prompt(doc)
for doc in fewshotex
]
)
+ "\n\n"
)
example = self.doc_to_text(doc)
return description + labeled_examples + example
class XNLI_en(XNLIBase): # English
DATASET_NAME = "en"
QUESTION_WORD = "right"
ENTAILMENT_LABEL = "Yes"
NEUTRAL_LABEL = "Also"
CONTRADICTION_LABEL = "No"
class XNLI_de(XNLIBase): # German
DATASET_NAME = "de"
QUESTION_WORD = "richtig"
ENTAILMENT_LABEL = "Ja"
NEUTRAL_LABEL = "Auch"
CONTRADICTION_LABEL = "Nein"
class XNLI_ar(XNLIBase): # Arabic
DATASET_NAME = "ar"
QUESTION_WORD = "صحيح"
ENTAILMENT_LABEL = "نعم"
NEUTRAL_LABEL = "لذا"
CONTRADICTION_LABEL = "رقم"
class XNLI_bg(XNLIBase): # Bulgarian
DATASET_NAME = "bg"
QUESTION_WORD = "правилно"
ENTAILMENT_LABEL = "да"
NEUTRAL_LABEL = "така"
CONTRADICTION_LABEL = "не"
class XNLI_el(XNLIBase): # Greek
DATASET_NAME = "el"
QUESTION_WORD = "σωστός"
ENTAILMENT_LABEL = "Ναί"
NEUTRAL_LABEL = "Έτσι"
CONTRADICTION_LABEL = "όχι"
class XNLI_es(XNLIBase): # Spanish
DATASET_NAME = "es"
QUESTION_WORD = "correcto"
ENTAILMENT_LABEL = "Sí"
NEUTRAL_LABEL = "Asi que"
CONTRADICTION_LABEL = "No"
class XNLI_fr(XNLIBase): # French
DATASET_NAME = "fr"
QUESTION_WORD = "correct"
ENTAILMENT_LABEL = "Oui"
NEUTRAL_LABEL = "Aussi"
CONTRADICTION_LABEL = "Non"
class XNLI_hi(XNLIBase): # Hindi
DATASET_NAME = "hi"
QUESTION_WORD = "सही"
ENTAILMENT_LABEL = "हाँ"
NEUTRAL_LABEL = "इसलिए"
CONTRADICTION_LABEL = "नहीं"
class XNLI_ru(XNLIBase): # Russian
DATASET_NAME = "ru"
QUESTION_WORD = "правильно"
ENTAILMENT_LABEL = "Да"
NEUTRAL_LABEL = "Так"
CONTRADICTION_LABEL = "Нет"
class XNLI_sw(XNLIBase): # Swahili
DATASET_NAME = "sw"
QUESTION_WORD = "sahihi"
ENTAILMENT_LABEL = "Ndiyo"
NEUTRAL_LABEL = "Hivyo"
CONTRADICTION_LABEL = "Hapana"
class XNLI_th(XNLIBase): # Thai
DATASET_NAME = "th"
QUESTION_WORD = "ถูกต้อง"
ENTAILMENT_LABEL = "ใช่"
NEUTRAL_LABEL = "ดังนั้น"
CONTRADICTION_LABEL = "ไม่"
class XNLI_tr(XNLIBase): # Turkish
DATASET_NAME = "tr"
QUESTION_WORD = "doğru"
ENTAILMENT_LABEL = "Evet"
NEUTRAL_LABEL = "Böylece"
CONTRADICTION_LABEL = "Hayır"
class XNLI_ur(XNLIBase): # Urdu
DATASET_NAME = "ur"
QUESTION_WORD = "صحیح"
ENTAILMENT_LABEL = "جی ہاں"
NEUTRAL_LABEL = "اس لئے"
CONTRADICTION_LABEL = "نہیں"
class XNLI_vi(XNLIBase): # Vietnamese
DATASET_NAME = "vi"
QUESTION_WORD = "đúng"
ENTAILMENT_LABEL = "Vâng"
NEUTRAL_LABEL = "Vì vậy"
CONTRADICTION_LABEL = "Không"
class XNLI_zh(XNLIBase): # Chinese
DATASET_NAME = "zh"
QUESTION_WORD = "正确"
ENTAILMENT_LABEL = "是的"
NEUTRAL_LABEL = "所以"
CONTRADICTION_LABEL = "不是的"
LANGS = [
"ar",
"bg",
"de",
"el",
"en",
"es",
"fr",
"hi",
"ru",
"sw",
"th",
"tr",
"ur",
"vi",
"zh",
]
LANG_CLASSES = [
XNLI_ar,
XNLI_bg,
XNLI_de,
XNLI_el,
XNLI_en,
XNLI_es,
XNLI_fr,
XNLI_hi,
XNLI_ru,
XNLI_sw,
XNLI_th,
XNLI_tr,
XNLI_ur,
XNLI_vi,
XNLI_zh,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xnli_{lang}"] = lang_class
return tasks
"""
Few-shot Learning with Multilingual Language Models
https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
"""
from .storycloze import StoryCloze
_CITATION = """
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
_LANG = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xstory_cloze_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XStoryCloze(StoryCloze):
DATASET_PATH = "juletxara/xstory_cloze"
DATASET_NAME = lang
def __init__(self):
super().__init__(data_dir="")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["eval"]
def test_docs(self):
pass
return XStoryCloze
"""
It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning
https://arxiv.org/abs/2106.12066
Multilingual winograd schema challenge that includes English, French, Japanese, Portuguese, Russian and Chinese. Winograd schema challenges come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
Homepage: https://huggingface.co/datasets/Muennighoff/xwinograd
"""
from .winogrande import Winogrande
_CITATION = """
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_LANG = ["en", "fr", "jp", "pt", "ru", "zh"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xwinograd_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XWinograd(Winogrande):
DATASET_PATH = "Muennighoff/xwinograd"
DATASET_NAME = lang
def __init__(self):
super().__init__()
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
return self.dataset["test"]
return XWinograd
import os
import pathlib
import re
import collections
import functools
import inspect
import sys
import fnmatch
from typing import List, Union
import gc
import torch
from omegaconf import OmegaConf
class ExitCodeError(Exception):
pass
def sh(x):
if os.system(x):
raise ExitCodeError()
def escaped_split(text, sep_char, maxsplit=-1):
"""Split text into a list on occurrences of the given separation
character `sep_char`. The separation character may be escaped by a
backslash to avoid splitting at that location.
The separation character must be a string of size 1.
If `maxsplit` is given, at most `maxsplit` splits are done (thus,
the list will have at most `maxsplit + 1` elements). If `maxsplit`
is not specified or less than 0, then there is no limit on the
number of splits (all possible splits are made).
"""
assert (
len(sep_char) == 1
), "separation string must be a single character for escaped splitting"
    if maxsplit == 0:
        # Return a list for consistency with the splitting branch below.
        return [text]
maxsplit = max(0, maxsplit)
return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
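# Illustrative example (not from the original file):
#   escaped_split(r"a,b\,c,d", ",") -> ['a', 'b\\,c', 'd']
# The backslash before the second comma suppresses the split at that point.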
def simple_parse_args_string(args_string):
"""
Parses something like
args1=val1,arg2=val2
Into a dictionary
"""
args_string = args_string.strip()
if not args_string:
return {}
arg_list = args_string.split(",")
args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
return args_dict
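# Illustrative example (not from the original file):
#   simple_parse_args_string("pretrained=gpt2,dtype=float16")
#   -> {"pretrained": "gpt2", "dtype": "float16"}
# OmegaConf's dotlist parsing also coerces scalars, e.g. "batch_size=8"
# yields the int 8 rather than the string "8".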
def join_iters(iters):
for iter in iters:
yield from iter
def chunks(iter, n=0, fn=None):
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i) if fn else n):
yield arr
arr = []
if arr:
yield arr
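# Illustrative example (not from the original file):
#   list(chunks(range(5), n=2)) -> [[0, 1], [2, 3], [4]]
# Passing `fn` instead of `n` allows chunk sizes that vary with the index.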
def group(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return list(res.values())
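# Illustrative example (not from the original file):
#   group([1, 2, 3, 4], lambda x: x % 2) -> [[1, 3], [2, 4]]
# Groups preserve first-seen key order (dicts are insertion-ordered).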
def _is_json_task(task_name):
return task_name == "json" or task_name.startswith("json=")
class MultiChoice:
def __init__(self, choices):
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
value
):
return False
return True
def __iter__(self):
for choice in self.choices:
yield choice
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
if _is_json_task(pattern):
task_names.add(pattern)
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return sorted(list(task_names))
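# Illustrative example (not from the original file):
#   pattern_match(["xcopa_*"], ["xcopa_et", "xcopa_it", "xnli_en"])
#   -> ["xcopa_et", "xcopa_it"]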
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
string = string.replace("( ", "(")
string = string.replace('" ', '"')
string = string.replace(' "', '"')
string = re.sub(r" (['.,])", r"\1", string)
return string
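# Illustrative example (not from the original file):
#   general_detokenize("She did n't go ( today ) , okay ?")
#   -> "She didn't go (today), okay ?"
# Only the listed punctuation patterns are collapsed; the trailing "?" is
# left untouched.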
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
condition on some context
:param token_list: list
List of tokens to be PREDICTED
:param max_seq_len: int
max_seq_len of model (or max_seq_len we want to use)
:param context_len: int
Amount of desired token context for prediction. Needs to be at least 1.
:param prefix_token: token
Dummy token like <eos> so the first token has something to condition on
:return: generator
Generator of tuples
(input_tokens, pred_tokens)
Note: Score only the last len(pred_tokens) logits of the LM
"""
assert 1 <= context_len <= max_seq_len
if not token_list:
return
# +1 offset, going from input->preds
pred_len = max_seq_len - context_len + 1
predicted = 0
# Special handling for first window: predict all tokens
first_seq_len = min(max_seq_len, len(token_list))
yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
predicted += first_seq_len
while predicted < len(token_list):
window_pred_len = min(len(token_list) - predicted, pred_len)
window_end = predicted + window_pred_len
yield (
token_list[window_end - max_seq_len - 1 : window_end - 1],
token_list[window_end - window_pred_len : window_end],
)
predicted += window_pred_len
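# Illustrative walk-through (assumed values, not from the original file):
#   list(get_rolling_token_windows(list(range(1, 11)), prefix_token=0,
#                                  max_seq_len=4, context_len=1))
#   -> [([0, 1, 2, 3], [1, 2, 3, 4]),
#       ([4, 5, 6, 7], [5, 6, 7, 8]),
#       ([6, 7, 8, 9], [9, 10])]
# The first window predicts every token; later windows condition each
# prediction on at least `context_len` preceding tokens.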
def make_disjoint_window(pair):
"""Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
a, b = pair
return a[: len(a) - (len(b) - 1)], b
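# Illustrative example (not from the original file):
#   make_disjoint_window(([6, 7, 8, 9], [9, 10])) -> ([6, 7, 8], [9, 10])
# The context is truncated so it no longer overlaps the continuation.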
def select_continuation_from_batch_left_padding(
generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
"""Select the continuation from the batch, removing prompts of different lengths.
Args:
generations (Union[List[List[int]], torch.Tensor]):
A tensor or list-of-lists of shape [batch_size, sequence length].
max_context_size (int):
The size of the biggest context; generations will proceed from that
index.
Example:
PAD PAD Continue : The dog chased the cat [every day of the week]
Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD
Output:
[every day of the week]
[yesterday] PAD PAD PAD PAD
"""
return generations[:, max_context_size:]
class Reorderer:
def __init__(self, arr, fn):
self.size = len(arr)
arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1]))
arr = [([y[0] for y in x], x[0][1]) for x in arr]
arr.sort(key=lambda x: fn(x[1]))
self.arr = arr
def get_reordered(self):
return [x[1] for x in self.arr]
def get_original(self, newarr):
res = [None] * self.size
cov = [False] * self.size
for (inds, _), v in zip(self.arr, newarr):
for ind in inds:
res[ind] = v
cov[ind] = True
assert all(cov)
return res
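# Illustrative example (not from the original file):
#   ro = Reorderer(["bb", "a", "ccc"], len)
#   ro.get_reordered()                   -> ["a", "bb", "ccc"]
#   ro.get_original(["A", "BB", "CCC"])  -> ["BB", "A", "CCC"]
# Requests are sorted (e.g. by length) for efficient batching, then results
# are scattered back to their original positions.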
def positional_deprecated(fn):
"""
A decorator to nudge users into passing only keyword args (`kwargs`) to the
wrapped function, `fn`.
"""
@functools.wraps(fn)
def _wrapper(*args, **kwargs):
        # Bound methods always receive `self` as one positional argument.
        # (Parentheses are required: without them the conditional expression
        # binds looser than `!=` and the check never fires for functions.)
        if len(args) != (1 if inspect.ismethod(fn) else 0):
print(
f"WARNING: using {fn.__name__} with positional arguments is "
"deprecated and will be disallowed in a future version of "
"lm-evaluation-harness!"
)
return fn(*args, **kwargs)
return _wrapper
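# Illustrative usage (not from the original file):
#   @positional_deprecated
#   def evaluate(model=None, tasks=None): ...
#   evaluate(model="gpt2")   # silent
#   evaluate("gpt2")         # prints the deprecation warning, still works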
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def clear_torch_cache():
gc.collect()
torch.cuda.empty_cache()
import argparse
import json
import logging
import os
from lm_eval import tasks, evaluator, utils
logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument("--max_batch_size", type=int, default=None,
help="Maximal batch size to try with --batch_size auto")
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=float, default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.")
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
return parser.parse_args()
def main():
args = parse_args()
assert not args.provide_description # not implemented
if args.limit:
print(
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.tasks is None:
task_names = tasks.ALL_TASKS
else:
task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")
description_dict = {}
if args.description_dict_path:
with open(args.description_dict_path, "r") as f:
description_dict = json.load(f)
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
description_dict=description_dict,
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
write_out=args.write_out,
output_base_path=args.output_base_path,
)
dumped = json.dumps(results, indent=2)
print(dumped)
if args.output_path:
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
main()
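# Example invocation (illustrative; the model and task names are placeholders):
#   python main.py --model hf-causal --model_args pretrained=gpt2 \
#       --tasks xcopa_it,xnli_en --num_fewshot 0 --device cuda:0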