Commit 7604b873 authored by cardy20

conflict changed

parents 17b04444 e8f38aee
@@ -12,10 +12,13 @@ https://arxiv.org/abs/2105.09680
Homepage: https://klue-benchmark.com/
"""
import datasets
from math import exp
import numpy as np
from lm_eval.base import Task, MultipleChoiceTask, rf
from lm_eval.metrics import macro_f1_score, mean, matthews_corrcoef, f1_score, yesno
from lm_eval.utils import general_detokenize
from functools import partial
_CITATION = """
@misc{park2021klue,
@@ -29,6 +32,18 @@ _CITATION = """
"""
def _squad_metric(predictions, references):
squad_metric = datasets.load_metric("squad_v2")
return squad_metric.compute(predictions=predictions, references=references)
def _squad_agg(key, items):
predictions, references = zip(*items)
return _squad_metric(predictions=predictions, references=references)[key]
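# Note (annotation, not part of the original file): _squad_agg is wired into
# MRC.aggregation() below via functools.partial. Each document contributes a
# (prediction, reference) pair from process_results; at aggregation time the
# pairs are unzipped and scored in one call to the HuggingFace "squad_v2"
# metric, and the named sub-metric (e.g. "exact", "f1") is returned.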
class STS(Task):
VERSION = 0
DATASET_PATH = "klue"
@@ -106,7 +121,7 @@ class YNAT(MultipleChoiceTask):
return self._training_docs
def validation_docs(self):
return map(self._process_doc,self.dataset["validation"])
return map(self._process_doc, self.dataset["validation"])
def _process_doc(self, doc):
out_doc = {
@@ -170,9 +185,11 @@ class NLI(Task):
)
def doc_to_target(self, doc):
# 참 = entailment
# 거짓 = contradiction
# 무관 = neutral
"""
참 = entailment
거짓 = contradiction
무관 = neutral
"""
return " {}".format({0: "참", 1: "중립", 2: "거짓"}[doc["label"]])
def construct_requests(self, doc, ctx):
@@ -191,3 +208,156 @@ class NLI(Task):
def aggregation(self):
return {"acc": mean}
class MRC(Task):
VERSION = 0
DATASET_PATH = "klue"
DATASET_NAME = "mrc"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
return '제목: ' + doc['title'] + '\n\n' + '본문: ' + doc['context'] + '\n\n' + '질문: ' + doc['question'] + '\n\n' + '답:'
def doc_to_target(self, doc):
answer = doc["answers"]["text"][0]
if doc["is_impossible"]:
answer = "대답 불가"
return " " + answer
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, ['\n'])
is_unanswerable = rf.loglikelihood(ctx, " " + "대답 불가")
return continuation, is_unanswerable
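# Note (annotation, not part of the original file): two requests are issued per
# document. The greedy generation (cut off at the first newline) is the
# predicted answer span, while the loglikelihood of " 대답 불가" ("unanswerable")
# is exponentiated in process_results below to give the no_answer_probability
# expected by the SQuAD v2 scorer.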
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
continuation, (logprob_unanswerable, _) = results
no_answer_probability = exp(logprob_unanswerable)
predictions = {
'id': doc['guid'],
'prediction_text': continuation,
'no_answer_probability': no_answer_probability,
}
references = {
'id': doc['guid'],
'answers': doc['answers'],
'unanswerable': doc['is_impossible'],
}
return {
"exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"HasAns_exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"NoAns_exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"best_exact": (
predictions,
references,
), # Best exact match (with varying threshold)
"best_f1": (predictions, references), # Best F1 (with varying threshold)
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"exact": partial(
_squad_agg, "exact"
), # Exact match (the normalized answer exactly match the gold answer)
"f1": partial(
_squad_agg, "f1"
), # The F-score of predicted tokens versus the gold answer
"HasAns_exact": partial(
_squad_agg, "HasAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": partial(
_squad_agg, "HasAns_f1"
), # The F-score of predicted tokens versus the gold answer
"NoAns_exact": partial(
_squad_agg, "NoAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": partial(
_squad_agg, "NoAns_f1"
), # The F-score of predicted tokens versus the gold answer
"best_exact": partial(
_squad_agg, "best_exact"
), # Best exact match (with varying threshold)
"best_f1": partial(
_squad_agg, "best_f1"
), # Best F1 (with varying threshold)
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"exact": True, # Exact match (the normalized answer exactly match the gold answer)
"f1": True, # The F-score of predicted tokens versus the gold answer
"HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": True, # The F-score of predicted tokens versus the gold answer
"NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": True, # The F-score of predicted tokens versus the gold answer
"best_exact": True, # Best exact match (with varying threshold)
"best_f1": True, # Best F1 (with varying threshold)
}
"""
KOLD
https://arxiv.org/abs/2205.11315
Homepage: https://github.com/boychaboy/KOLD
"""
import inspect
import lm_eval.datasets.kold.kold
import numpy as np
from lm_eval.base import MultipleChoiceTask, rf, Task
from lm_eval.metrics import macro_f1_score, mean, f1_score
def make_label(label: str):
"""
0 = 없음
1 = 개인
2 = 단체
"""
if label == "individual":
return 1
elif label == "group":
return 2
else:
return 0
class KoldLevelA(Task):
VERSION = 0
DATASET_PATH = inspect.getfile(lm_eval.datasets.kold.kold)
DATASET_NAME = "kolda"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
def format_example(doc):
prompt = f"문장: {doc['comment']}\n"
prompt += f"질문: 문장이 공격적인가요?\n정답:"
return prompt
return {
"prompt": format_example(doc),
"label": doc["off"]
}
def doc_to_text(self, doc):
return doc["prompt"]
def doc_to_target(self, doc):
return " {}".format({0: "아니오", 1: "예"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_no, _ = rf.loglikelihood(ctx, " 아니오")
ll_yes, _ = rf.loglikelihood(ctx, " 예")
return ll_no, ll_yes
def process_results(self, doc, results):
pred = np.argmax(results)
gold = doc["label"]
return {
"acc": pred == gold,
"f1": (gold, pred)
}
def higher_is_better(self):
return {
"acc": True,
"f1": True
}
def aggregation(self):
return {
"acc": mean,
"f1": f1_score
}
class KoldLevelB(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = inspect.getfile(lm_eval.datasets.kold.kold)
DATASET_NAME = "koldb"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
def format_example(doc, choices):
prompt = f"문장: {doc['comment']}\n"
prompt += "질문: 공격 대상이 "
prompt += "".join([f"{choice} "for choice in choices])
prompt += "중 무엇인가요?\n정답:"
return prompt
choices = ["없음", "개인", "단체"]
return {
"prompt": format_example(doc, choices),
"choices": choices,
"label": make_label(doc["tgt"])
}
def doc_to_text(self, doc):
return doc["prompt"]
def doc_to_target(self, doc):
return " {}".format({0: "없음", 1: "개인", 2:"단체"}[doc["label"]])
def process_results(self, doc, results):
pred = np.argmax(results)
gold = doc["label"]
return {
"f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
}
def aggregation(self):
return {
"f1": macro_f1_score
}
"""
Korean UnSmile Dataset
Github: https://github.com/smilegate-ai/korean_unsmile_dataset
"""
import numpy as np
from lm_eval.base import MultipleChoiceTask
from lm_eval.metrics import macro_f1_score
_CITATION = """
@misc{SmilegateAI2022KoreanUnSmileDataset,
title = {Korean UnSmile dataset: Human-annotated Multi-label Korean Hate Speech Dataset},
author = {Seonghyun Kim},
year = {2022},
howpublished = {https://github.com/smilegate-ai/korean_unsmile_dataset},
}
"""
def multilable_to_multiclass(label: list):
"""
0 = 혐오
1 = 악플
2 = 양호
"""
assert type(label[0]) == int
_id = np.argmax(label)
if _id == 8:
return 1
elif _id == 9:
return 2
else:
return 0
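# Note (annotation, not part of the original file): the argmax index is assumed
# to follow the dataset's label-vector ordering, with index 8 corresponding to
# the "악플" (abusive comment) label and index 9 to the "clean" label; any other
# index is mapped to the hate-speech ("혐오") class.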
class KorUnSmile(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "smilegate-ai/kor_unsmile"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc,self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc,self.dataset["valid"])
def _process_doc(self, doc):
out_doc = {
"title": doc["문장"],
"choices": ["혐오", "악플", "양호"],
"gold": multilable_to_multiclass(doc["labels"])
}
return out_doc
def doc_to_text(self, doc):
return "{}".format(doc["title"])
def doc_to_target(self, doc):
return " {}".format({0: "혐오", 1: "악플", 2: "양호"}[doc["gold"]])
def process_results(self, doc, results):
pred = np.argmax(results)
gold = doc["gold"]
return {
"f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
}
def aggregation(self):
return {
"f1": macro_f1_score
}
"""
Language Models are Multilingual Chain-of-Thought Reasoners
https://arxiv.org/abs/2210.03057
Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) were each translated by human annotators into 10 languages. The 10 languages are:
- Spanish
- French
- German
- Russian
- Chinese
- Japanese
- Thai
- Swahili
- Bengali
- Telugu
GSM8K (Grade School Math 8K) is a dataset of 8.5K high-quality, linguistically diverse grade-school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.
The inputs and targets for each of the ten languages (and English) are provided as `.tsv` files.
Few-shot exemplars, also manually translated for each language, are included in `exemplars.py`.
Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
"""
import re
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{shi2022language,
title={Language Models are Multilingual Chain-of-Thought Reasoners},
author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
year={2022},
eprint={2210.03057},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
ANS_RE = re.compile(r"(\-?\d+)")
INVALID_ANS = "[invalid]"
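# Note (annotation, not part of the original file): ANS_RE matches (possibly
# negative) integers. _extract_answer below takes the last integer found in the
# model's completion as its numeric answer, and INVALID_ANS is returned when no
# integer can be found.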
class MGSM(Task):
VERSION = 0
DATASET_PATH = "juletxara/mgsm"
DATASET_NAME = None
QUESTION = "Question:"
ANSWER = "Step-by-Step Answer:"
def has_training_docs(self):
return True
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
raise NotImplementedError
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
if doc["answer"] is not None:
return doc["question"] + "\n" + self.ANSWER
else:
return self.QUESTION + " " + doc["question"] + "\n" + self.ANSWER
def doc_to_target(self, doc):
if doc["answer"] is not None:
return " " + doc["answer"][len(self.ANSWER) + 1 :]
else:
return " " + str(doc["answer_number"])
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
completion = rf.greedy_until(ctx, {"until": ["\n", ":", self.QUESTION]})
return completion
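# Note (annotation, not part of the original file): generation is cut off at a
# newline, a colon, or the next "Question:" marker, so only the step-by-step
# answer for the current problem is kept before the final number is extracted
# below.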
def _extract_answer(self, completion):
match = re.findall(ANS_RE, completion)
if match:
return int(match[-1])
else:
return INVALID_ANS
def _is_correct(self, completion, answer):
gold = answer
assert gold != INVALID_ANS, "No ground truth answer found in the document."
return self._extract_answer(completion) == gold
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
completion = results[0]
answer = doc["answer_number"]
return {"acc": self._is_correct(completion, answer)}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
class MGSM_English(MGSM):
DATASET_NAME = "en"
QUESTION = "Question:"
ANSWER = "Step-by-Step Answer:"
class MGSM_Spanish(MGSM):
DATASET_NAME = "es"
QUESTION = "Pregunta:"
ANSWER = "Respuesta paso a paso:"
class MGSM_French(MGSM):
DATASET_NAME = "fr"
QUESTION = "Question :"
ANSWER = "R\u00e9ponse \u00e9tape par \u00e9tape :"
class MGSM_German(MGSM):
DATASET_NAME = "de"
QUESTION = "Frage:"
ANSWER = "Schritt-f\u00fcr-Schritt-Antwort:"
class MGSM_Russian(MGSM):
DATASET_NAME = "ru"
QUESTION = "\u0417\u0430\u0434\u0430\u0447\u0430:"
ANSWER = "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:"
class MGSM_Chinese(MGSM):
DATASET_NAME = "zh"
QUESTION = "\u95ee\u9898:"
ANSWER = "\u9010\u6b65\u89e3\u7b54:"
class MGSM_Japanese(MGSM):
DATASET_NAME = "ja"
QUESTION = "\u554f\u984c:"
ANSWER = "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:"
class MGSM_Thai(MGSM):
DATASET_NAME = "th"
QUESTION = "\u0e42\u0e08\u0e17\u0e22\u0e4c:"
ANSWER = "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:"
class MGSM_Swahili(MGSM):
DATASET_NAME = "sw"
QUESTION = "Swali:"
ANSWER = "Jibu la Hatua kwa Hatua:"
class MGSM_Bengali(MGSM):
DATASET_NAME = "bn"
QUESTION = "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"
ANSWER = "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:"
class MGSM_Telugu(MGSM):
DATASET_NAME = "te"
QUESTION = "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"
ANSWER = "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:"
LANGS = ["en", "es", "fr", "de", "ru", "zh", "ja", "th", "sw", "bn", "te"]
LANG_CLASSES = [
MGSM_English,
MGSM_Spanish,
MGSM_French,
MGSM_German,
MGSM_Russian,
MGSM_Chinese,
MGSM_Japanese,
MGSM_Thai,
MGSM_Swahili,
MGSM_Bengali,
MGSM_Telugu,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"mgsm_{lang}"] = lang_class
return tasks
"""
PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification
https://arxiv.org/abs/1908.11828
The dataset consists of 23,659 human-translated PAWS evaluation pairs and
296,406 machine-translated training pairs in 6 typologically distinct languages.
Examples are adapted from PAWS-Wiki.
Prompt format (same as in mGPT):
"<s>" + sentence1 + ", right? " + mask + ", " + sentence2 + "</s>",
where mask is the string that matches the label:
Yes, No.
Example:
<s> The Tabaci River is a tributary of the River Leurda in Romania, right? No, The Leurda River is a tributary of the River Tabaci in Romania.</s>
Language-specific prompts are translated word-by-word with Google Translate
and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
Homepage: https://github.com/google-research-datasets/paws/tree/master/pawsx
"""
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{yang-etal-2019-paws,
title = "{PAWS}-{X}: A Cross-lingual Adversarial Dataset for Paraphrase Identification",
author = "Yang, Yinfei and
Zhang, Yuan and
Tar, Chris and
Baldridge, Jason",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-1382",
doi = "10.18653/v1/D19-1382",
pages = "3687--3692",
}"""
class PAWSXBase(Task):
VERSION = 0
DATASET_PATH = "paws-x"
DATASET_NAME = None # 'en'
YES = None # 'Yes'
NO = None # 'No'
QUESTION_WORD = None # 'right'
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# same as in mGPT paper
return (
doc["sentence1"]
+ ", "
+ self.QUESTION_WORD
+ "? [MASK], "
+ doc["sentence2"]
)
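# Illustration (annotation, not part of the original file): with the sentence
# pair from the module docstring, doc_to_text yields
# "The Tabaci River is a tributary of the River Leurda in Romania, right? [MASK],
# The Leurda River is a tributary of the River Tabaci in Romania";
# construct_requests below then scores the sequence with [MASK] replaced by YES
# and by NO, and process_results picks whichever is more likely.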
def doc_to_target(self, doc):
return " " + [self.YES, self.NO][doc["label"]]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or
test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_yes = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.YES))
ll_no = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.NO))
return ll_yes, ll_no
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
ll_yes, ll_no = results
pred = ll_yes > ll_no
true_label = doc["label"]
return {
"acc": pred == true_label,
}
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metric scores
"""
return {
"acc": mean,
}
def higher_is_better(self):
return {"acc": True}
class PAWSX_en(PAWSXBase):
DATASET_NAME = "en"
YES = "Yes"
NO = "No"
QUESTION_WORD = "right"
class PAWSX_de(PAWSXBase):
DATASET_NAME = "de"
YES = "Ja"
NO = "Nein"
QUESTION_WORD = "richtig"
class PAWSX_fr(PAWSXBase):
DATASET_NAME = "fr"
YES = "Oui"
NO = "Non"
QUESTION_WORD = "n'est-ce pas"
class PAWSX_es(PAWSXBase):
DATASET_NAME = "es"
YES = "Sí"
NO = "No"
QUESTION_WORD = "verdad"
class PAWSX_ja(PAWSXBase):
DATASET_NAME = "ja"
YES = "はい"
NO = "いいえ"
QUESTION_WORD = "ですね"
class PAWSX_ko(PAWSXBase):
DATASET_NAME = "ko"
YES = "예"
NO = "아니요"
QUESTION_WORD = "맞죠"
class PAWSX_zh(PAWSXBase):
DATASET_NAME = "zh"
YES = "是"
NO = "不是"
QUESTION_WORD = "对吧"
LANGS = [
"en",
"de",
"es",
"fr",
"ja",
"ko",
"zh",
]
LANG_CLASSES = [
PAWSX_en,
PAWSX_de,
PAWSX_es,
PAWSX_fr,
PAWSX_ja,
PAWSX_ko,
PAWSX_zh,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"pawsx_{lang}"] = lang_class
return tasks
@@ -214,7 +214,7 @@ class QASPER(Task):
"""
# unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
if doc["answer_type"] in ("free form answer"):
return [rf.greedy_until(ctx, {'until': ["\n"]})]
return [rf.greedy_until(ctx, {"until": ["\n"]})]
elif doc["answer_type"] in ("bool"):
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
@@ -107,7 +107,7 @@ class SQuAD2(Task):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, {'until': ["\n"]})
continuation = rf.greedy_until(ctx, {"until": ["\n"]})
is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
return continuation, is_unanswerable
@@ -184,7 +184,7 @@ class GeneralTranslationTask(Task):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
return rf.greedy_until(ctx, {'until': ["\n"]})
return rf.greedy_until(ctx, {"until": ["\n"]})
def process_results(self, doc, results):
# Add spaces between words for BLEU score calculation of target languages like Chinese
@@ -247,7 +247,7 @@ class TruthfulQAGeneration(Task):
part of the document for `doc`.
"""
# TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
completion = rf.greedy_until(ctx, {'until': ["."]})
completion = rf.greedy_until(ctx, {"until": ["."]})
return completion
def process_results(self, doc, results):
@@ -59,7 +59,7 @@ class WordUnscrambleTask(Task):
return doc["completion"]
def construct_requests(self, doc, ctx):
completion = rf.greedy_until(ctx, {'until': ["\n"]})
completion = rf.greedy_until(ctx, {"until": ["\n"]})
return completion
def process_results(self, doc, results):
"""
XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
Homepage: https://github.com/cambridgeltl/xcopa
"""
from .superglue import Copa
_CITATION = """
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2020},
url={https://ducdauge.github.io/files/xcopa.pdf}
}
"""
class XCopa(Copa):
VERSION = 0
DATASET_PATH = "xcopa"
DATASET_NAME = None
CAUSE = "because"
EFFECT = "therefore"
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": self.CAUSE,
"effect": self.EFFECT,
}[doc["question"]]
return doc["premise"].strip()[:-1] + f" {connector}"
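# Illustration (annotation, not part of the original file): for a document with
# question == "cause", the premise's trailing period is dropped and the
# language-specific CAUSE connector is appended (e.g. " sest" in the Estonian
# subclass below); the inherited Copa logic then compares the likelihoods of the
# two candidate continuations against this prompt.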
class XCopaEt(XCopa):
DATASET_NAME = "et"
CAUSE = "sest"
EFFECT = "seetõttu"
class XCopaHt(XCopa):
DATASET_NAME = "ht"
CAUSE = "poukisa"
EFFECT = "donk sa"
class XCopaIt(XCopa):
DATASET_NAME = "it"
CAUSE = "perché"
EFFECT = "quindi"
class XCopaId(XCopa):
DATASET_NAME = "id"
CAUSE = "karena"
EFFECT = "maka"
class XCopaQu(XCopa):
DATASET_NAME = "qu"
CAUSE = "imataq"
EFFECT = "chaymi"
class XCopaSw(XCopa):
DATASET_NAME = "sw"
CAUSE = "kwa sababu"
EFFECT = "kwa hiyo"
class XCopaZh(XCopa):
DATASET_NAME = "zh"
CAUSE = "因为"
EFFECT = "所以"
class XCopaTa(XCopa):
DATASET_NAME = "ta"
CAUSE = "காரணமாக"
EFFECT = "எனவே"
class XCopaTh(XCopa):
DATASET_NAME = "th"
CAUSE = "เพราะ"
EFFECT = "ดังนั้น"
class XCopaTr(XCopa):
DATASET_NAME = "tr"
CAUSE = "çünkü"
EFFECT = "bu yüzden"
class XCopaVi(XCopa):
DATASET_NAME = "vi"
CAUSE = "bởi vì"
EFFECT = "vì vậy"
LANGS = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]
LANG_CLASSES = [
XCopaEt,
XCopaHt,
XCopaIt,
XCopaId,
XCopaQu,
XCopaSw,
XCopaZh,
XCopaTa,
XCopaTh,
XCopaTr,
XCopaVi,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xcopa_{lang}"] = lang_class
return tasks
"""
XNLI: Evaluating Cross-lingual Sentence Representations
https://arxiv.org/abs/1809.05053
Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
Prompt format (same as XGLM and mGPT):
sentence1 + ", right? " + mask + ", " + sentence2, where mask is one of (Yes|Also|No) according to the label.
The prediction is the full sequence with the highest likelihood.
Language-specific prompts are translated word-by-word with Google Translate
and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
Homepage: https://github.com/facebookresearch/XNLI
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATIONS = """
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
class XNLIBase(Task):
VERSION = 0
DATASET_PATH = "xnli"
DATASET_NAME = None
QUESTION_WORD = None # 'right'
ENTAILMENT_LABEL = None # 'Yes'
NEUTRAL_LABEL = None # 'Also'
CONTRADICTION_LABEL = None # 'No'
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Example:
# The girl that can help me is all the way across town, right? Yes, The girl I need help from lives a ways away.
# [MASK] is replaced with ENTAILMENT_LABEL, NEUTRAL_LABEL, or CONTRADICTION_LABEL
return (
doc["premise"]
+ ", "
+ self.QUESTION_WORD
+ "? [MASK], "
+ doc["hypothesis"]
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return (
" "
+ [self.ENTAILMENT_LABEL, self.NEUTRAL_LABEL, self.CONTRADICTION_LABEL][
doc["label"]
]
)
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.ENTAILMENT_LABEL))
ll_neither = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.NEUTRAL_LABEL))
ll_false = rf.loglikelihood_rolling(
ctx.replace("[MASK]", self.CONTRADICTION_LABEL)
)
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {"acc": pred == gold}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
class XNLI_en(XNLIBase): # English
DATASET_NAME = "en"
QUESTION_WORD = "right"
ENTAILMENT_LABEL = "Yes"
NEUTRAL_LABEL = "Also"
CONTRADICTION_LABEL = "No"
class XNLI_de(XNLIBase): # German
DATASET_NAME = "de"
QUESTION_WORD = "richtig"
ENTAILMENT_LABEL = "Ja"
NEUTRAL_LABEL = "Auch"
CONTRADICTION_LABEL = "Nein"
class XNLI_ar(XNLIBase): # Arabic
DATASET_NAME = "ar"
QUESTION_WORD = "صحيح"
ENTAILMENT_LABEL = "نعم"
NEUTRAL_LABEL = "لذا"
CONTRADICTION_LABEL = "رقم"
class XNLI_bg(XNLIBase): # Bulgarian
DATASET_NAME = "bg"
QUESTION_WORD = "правилно"
ENTAILMENT_LABEL = "да"
NEUTRAL_LABEL = "така"
CONTRADICTION_LABEL = "не"
class XNLI_el(XNLIBase): # Greek
DATASET_NAME = "el"
QUESTION_WORD = "σωστός"
ENTAILMENT_LABEL = "Ναί"
NEUTRAL_LABEL = "Έτσι"
CONTRADICTION_LABEL = "όχι"
class XNLI_es(XNLIBase): # Spanish
DATASET_NAME = "es"
QUESTION_WORD = "correcto"
ENTAILMENT_LABEL = "Sí"
NEUTRAL_LABEL = "Asi que"
CONTRADICTION_LABEL = "No"
class XNLI_fr(XNLIBase): # French
DATASET_NAME = "fr"
QUESTION_WORD = "correct"
ENTAILMENT_LABEL = "Oui"
NEUTRAL_LABEL = "Aussi"
CONTRADICTION_LABEL = "Non"
class XNLI_hi(XNLIBase): # Hindi
DATASET_NAME = "hi"
QUESTION_WORD = "सही"
ENTAILMENT_LABEL = "हाँ"
NEUTRAL_LABEL = "इसलिए"
CONTRADICTION_LABEL = "नहीं"
class XNLI_ru(XNLIBase): # Russian
DATASET_NAME = "ru"
QUESTION_WORD = "правильно"
ENTAILMENT_LABEL = "Да"
NEUTRAL_LABEL = "Так"
CONTRADICTION_LABEL = "Нет"
class XNLI_sw(XNLIBase): # Swahili
DATASET_NAME = "sw"
QUESTION_WORD = "sahihi"
ENTAILMENT_LABEL = "Ndiyo"
NEUTRAL_LABEL = "Hivyo"
CONTRADICTION_LABEL = "Hapana"
class XNLI_th(XNLIBase): # Thai
DATASET_NAME = "th"
QUESTION_WORD = "ถูกต้อง"
ENTAILMENT_LABEL = "ใช่"
NEUTRAL_LABEL = "ดังนั้น"
CONTRADICTION_LABEL = "ไม่"
class XNLI_tr(XNLIBase): # Turkish
DATASET_NAME = "tr"
QUESTION_WORD = "doğru"
ENTAILMENT_LABEL = "Evet"
NEUTRAL_LABEL = "Böylece"
CONTRADICTION_LABEL = "Hayır"
class XNLI_ur(XNLIBase): # Urdu
DATASET_NAME = "ur"
QUESTION_WORD = "صحیح"
ENTAILMENT_LABEL = "جی ہاں"
NEUTRAL_LABEL = "اس لئے"
CONTRADICTION_LABEL = "نہیں"
class XNLI_vi(XNLIBase): # Vietnamese
DATASET_NAME = "vi"
QUESTION_WORD = "đúng"
ENTAILMENT_LABEL = "Vâng"
NEUTRAL_LABEL = "Vì vậy"
CONTRADICTION_LABEL = "Không"
class XNLI_zh(XNLIBase): # Chinese
DATASET_NAME = "zh"
QUESTION_WORD = "正确"
ENTAILMENT_LABEL = "是的"
NEUTRAL_LABEL = "所以"
CONTRADICTION_LABEL = "不是的"
LANGS = [
"ar",
"bg",
"de",
"el",
"en",
"es",
"fr",
"hi",
"ru",
"sw",
"th",
"tr",
"ur",
"vi",
"zh",
]
LANG_CLASSES = [
XNLI_ar,
XNLI_bg,
XNLI_de,
XNLI_el,
XNLI_en,
XNLI_es,
XNLI_fr,
XNLI_hi,
XNLI_ru,
XNLI_sw,
XNLI_th,
XNLI_tr,
XNLI_ur,
XNLI_vi,
XNLI_zh,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xnli_{lang}"] = lang_class
return tasks
"""
Few-shot Learning with Multilingual Language Models
https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) into 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
"""
from .storycloze import StoryCloze
_CITATION = """
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
_LANG = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xstory_cloze_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XStoryCloze(StoryCloze):
DATASET_PATH = "juletxara/xstory_cloze"
DATASET_NAME = lang
def __init__(self):
super().__init__(data_dir="")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["eval"]
def test_docs(self):
pass
return XStoryCloze
"""
It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning
https://arxiv.org/abs/2106.12066
A multilingual Winograd Schema Challenge that includes English, French, Japanese, Portuguese, Russian, and Chinese. The Winograd schemas come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
Homepage: https://huggingface.co/datasets/Muennighoff/xwinograd
"""
from .winogrande import Winogrande
_CITATION = """
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_LANG = ["en", "fr", "jp", "pt", "ru", "zh"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xwinograd_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XWinograd(Winogrande):
DATASET_PATH = "Muennighoff/xwinograd"
DATASET_NAME = lang
def __init__(self):
super().__init__()
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
return self.dataset["test"]
return XWinograd
@@ -21,6 +21,29 @@ def sh(x):
raise ExitCodeError()
def escaped_split(text, sep_char, maxsplit=-1):
"""Split text into a list on occurrences of the given separation
character `sep_char`. The separation character may be escaped by a
backslash to avoid splitting at that location.
The separation character must be a string of size 1.
If `maxsplit` is given, at most `maxsplit` splits are done (thus,
the list will have at most `maxsplit + 1` elements). If `maxsplit`
is not specified or less than 0, then there is no limit on the
number of splits (all possible splits are made).
"""
assert (
len(sep_char) == 1
), "separation string must be a single character for escaped splitting"
if maxsplit == 0:
return text
maxsplit = max(0, maxsplit)
return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
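# Illustrative behavior (annotation, not part of the original file; a sketch of
# how escaped_split behaves given the regex above):
#   escaped_split(r"a,b\,c,d", ",")  -> ['a', 'b\\,c', 'd']   # escaped comma is kept, backslash included
#   escaped_split("a,b,c", ",", 0)   -> 'a,b,c'               # maxsplit=0 returns the text unchanged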
def simple_parse_args_string(args_string):
"""
Parses something like
@@ -223,4 +246,3 @@ def run_task_tests(task_list: List[str]):
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
@@ -2,12 +2,17 @@ import argparse
import json
import logging
import fnmatch
import os
from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING)
def _is_json_task(task_name):
return task_name == "json" or task_name.startswith("json=")
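# Note (annotation, not part of the original file): tasks named exactly "json"
# or prefixed with "json=" bypass the normal task-name matching; the
# MultiChoice.__contains__ and pattern_match changes below special-case them so
# such a task spec is accepted even though it is not in the registered task list.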
class MultiChoice:
def __init__(self, choices):
self.choices = choices
@@ -15,7 +20,9 @@ class MultiChoice:
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
value
):
return False
return True
@@ -35,11 +42,16 @@ def parse_args():
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--limit", type=float, default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.")
parser.add_argument("--data_sampling", type=float, default=None)
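# Note on --limit above (annotation, not part of the original file): for
# example, "--limit 100" evaluates at most 100 examples per task, while
# "--limit 0.1" evaluates 10% of each task's examples (values below 1 are
# interpreted as a fraction of the total).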
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
return parser.parse_args()
@@ -49,6 +61,9 @@ def parse_args():
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
if _is_json_task(pattern):
task_names.add(pattern)
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return sorted(list(task_names))
@@ -88,12 +103,15 @@ def main():
description_dict=description_dict,
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
write_out=args.write_out,
output_base_path=args.output_base_path,
)
dumped = json.dumps(results, indent=2)
print(dumped)
if args.output_path:
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
{
"results": {
"boolq": {
"acc": 0.5908256880733945,
"acc_stderr": 0.008599563442397352
},
"arc_easy": {
"acc": 0.5147306397306397,
"acc_stderr": 0.010255329977562096,
"acc_norm": 0.45454545454545453,
"acc_norm_stderr": 0.010217299762709435
},
"openbookqa": {
"acc": 0.196,
"acc_stderr": 0.017770751227744862,
"acc_norm": 0.294,
"acc_norm_stderr": 0.020395095484936614
},
"hellaswag": {
"acc": 0.3463453495319657,
"acc_stderr": 0.004748324319714264,
"acc_norm": 0.4177454690300737,
"acc_norm_stderr": 0.004921798492608764
},
"swag": {
"acc": 0.43431970408877335,
"acc_stderr": 0.0035044592489844794,
"acc_norm": 0.5828251524542637,
"acc_norm_stderr": 0.0034862531772295617
},
"arc_challenge": {
"acc": 0.2363481228668942,
"acc_stderr": 0.012414960524301834,
"acc_norm": 0.2568259385665529,
"acc_norm_stderr": 0.0127669237941168
},
"mc_taco": {
"em": 0.1448948948948949,
"f1": 0.32425976796237205
},
"wsc273": {
"acc": 0.684981684981685,
"acc_stderr": 0.028165854394193602
},
"winogrande": {
"acc": 0.5493291239147593,
"acc_stderr": 0.013983928869040239
},
"prost": {
"acc": 0.23409479077711356,
"acc_stderr": 0.003093545711826552,
"acc_norm": 0.3049743808710504,
"acc_norm_stderr": 0.003363606918420179
},
"copa": {
"acc": 0.68,
"acc_stderr": 0.04688261722621504
},
"piqa": {
"acc": 0.6713819368879217,
"acc_stderr": 0.010959127105167048,
"acc_norm": 0.6713819368879217,
"acc_norm_stderr": 0.010959127105167044
}
},
"versions": {
"boolq": 1,
"arc_easy": 0,
"openbookqa": 0,
"hellaswag": 0,
"swag": 0,
"arc_challenge": 0,
"mc_taco": 0,
"wsc273": 0,
"winogrande": 0,
"prost": 0,
"copa": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.008339651250947688,
"acc_stderr": 0.002504942226860508
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mathqa": {
"acc": 0.2355108877721943,
"acc_stderr": 0.007767687364650971,
"acc_norm": 0.23618090452261306,
"acc_norm_stderr": 0.0077753193787470495
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.013842281879194632,
"em_stderr": 0.001196510970060749,
"f1": 0.040085989932885986,
"f1_stderr": 0.0014841664758736023
},
"math_geometry": {
"acc": 0.0020876826722338203,
"acc_stderr": 0.0020876826722338315
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.002109704641350211
},
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175708
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518448
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"mathqa": 0,
"gsm8k": 0,
"drop": 1,
"math_geometry": 1,
"math_counting_and_prob": 1,
"math_prealgebra": 1,
"math_num_theory": 1,
"math_precalc": 1,
"math_algebra": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}