Merge branch 'master' into master

50d02961 · Stella Biderman · GitHub · e04aab6d · 5e59320b · 50d02961
Unverified Commit 50d02961 authored Apr 28, 2022 by Stella Biderman Committed by GitHub Apr 28, 2022
12 changed files
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -353,11 +353,13 @@ class BaseLM(LM):
        for context, request_args in tqdm(reord.get_reordered()):
            stopping_criteria = request_args["stopping_criteria"]
            max_generation_length = request_args["max_generation_length"]
+            num_fewshot = request_args["num_fewshot"]
            assert isinstance(stopping_criteria, str) or stopping_criteria is None
            assert (
                isinstance(max_generation_length, int) or max_generation_length is None
            )
+            assert isinstance(num_fewshot, int) or num_fewshot is None
            if stopping_criteria is None:
                until = [self.eot_token] 
@@ -382,9 +384,10 @@ class BaseLM(LM):
                context_enc,
                max_length,
                torch.tensor(primary_until),
+                num_fewshot,
            )
-            s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :])
+            s = self.tok_decode(cont.tolist())
            for term in until:
                s = s.split(term)[0]
@@ -536,7 +539,7 @@ class Task(abc.ABC):
        pass
    @abstractmethod
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, args):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
@@ -546,6 +549,8 @@ class Task(abc.ABC):
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
+        :param args: dict
+            The specifics of the context, including number of few shots.
        """
        pass
@@ -724,7 +729,7 @@ class PromptSourceTask(Task):
        text, _ = self.prompt.apply(doc)
        return text
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, args):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
@@ -734,6 +739,8 @@ class PromptSourceTask(Task):
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
+        :param args: dict
+            The specifics of the context, including number of few shots.
        """
        _requests = []
        answer_choices_list = self.prompt.get_answer_choices_list(doc)
@@ -749,6 +756,7 @@ class PromptSourceTask(Task):
            request_args = {
                "stopping_criteria": self.stopping_criteria(),
                "max_generation_length": self.max_generation_length(),
+                "num_fewshot": args["num_fewshot"],
            }
            cont_request = rf.greedy_until(ctx, request_args)
            _requests.append(cont_request)
@@ -915,12 +923,12 @@ class PromptSourceTask(Task):
        if num_fewshot == 0:
            labeled_examples = ""
-            fewshotex, fewshotidx, fewshotsource = [], [], None
+            fewshotex, fewshotidx, self.fewshotsource = [], [], None
        else:
            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
            if self.has_training_docs():
                fewshotex, fewshotidx = self.fewshot_examples(k=num_fewshot, rnd=rnd)
-                fewshotsource = "train"
+                self.fewshotsource = "train"
            else:
                if self._fewshot_docs is None:
                    self._fewshot_docs = list(
@@ -929,18 +937,18 @@ class PromptSourceTask(Task):
                        else self.test_docs()
                    )
                    if self.has_validation_docs():
-                        fewshotsource = "val"
+                        self.fewshotsource = "val"
                    elif self.test_docs():
-                        fewshotsource = "test"
+                        self.fewshotsource = "test"
                fewshotex, fewshotidx = self._get_fewshot_examples(
                    self._fewshot_docs, k=num_fewshot + 1, rnd=rnd
                )
-                fewshotex, fewshotidx = [
+                fewshotex, fewshotidx = zip(*[
                    (shot, idx)
                    for shot, idx in zip(fewshotex, fewshotidx)
                    if shot != doc
-                ]
+                ])
                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
                fewshotex, fewshotidx = (
                    fewshotex[:num_fewshot],
@@ -966,7 +974,7 @@ class PromptSourceTask(Task):
            ctx,
            {
                "fewshot_idx": fewshotidx,
-                "fewshot_source": fewshotsource,
+                "fewshot_source": self.fewshotsource,
                "fewshot_num": num_fewshot,
                "ctx": ctx,
            },

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -206,7 +206,8 @@ def evaluate(
                doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
            )
            fewshotex_logging_info["doc_id"] = original_doc_id
-            reqs = task.construct_requests(doc, ctx)
+            args = {"num_fewshot": num_fewshot}
+            reqs = task.construct_requests(doc, ctx, args)
            if not isinstance(reqs, (list, tuple)):
                reqs = [reqs]
            for i, req in enumerate(reqs):

--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -12,6 +12,7 @@ class HFLM(BaseLM):
        subfolder=None,
        tokenizer=None,
        batch_size=1,
+        parallelize=False
    ):
        super().__init__()
@@ -32,7 +33,7 @@ class HFLM(BaseLM):
        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-        ).to(self.device)
+        )
        self.gpt2.eval()
        # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
@@ -68,9 +69,11 @@ class HFLM(BaseLM):
        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
        # TODO: fix multi-gpu
-        # gpus = torch.cuda.device_count()
+        if parallelize:
-        # if gpus > 1:
+            self.gpt2.parallelize()
-        #     self.gpt2 = nn.DataParallel(self.gpt2)
+            self._device = torch.device('cuda:0')
+        else:
+            self.gpt2.to(self._device)
    @property
    def eot_token(self):
@@ -146,16 +149,26 @@ class HFLM(BaseLM):
            EOSCriteria(self.tokenizer.eos_token)
        ])
-    def _model_generate(self, context, max_length, stopping_criteria_ids):
+    def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
+        """Generate a continuation for `context` without sampling (`do_sample=False`).
+        When `num_fewshot == 0` generation is bounded by `eos_token_id`; otherwise
+        the criteria built from `stopping_criteria_ids` are passed to `generate`.
+        Returns row 0 of the output with the first `context.shape[1]` positions
+        sliced off, i.e. without the prompt.
+        """
        stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
-        return self.gpt2.generate(
-            context, 
-            max_length=max_length, 
-            stopping_criteria=stopping_criteria,
-            do_sample=False,
-        )
+        if num_fewshot == 0:
+            generations = self.gpt2.generate(
+                context, 
+                max_length=max_length, 
+                eos_token_id=self.eot_token_id,
+                do_sample=False,
+            )
+        else:
+            generations = self.gpt2.generate(
+                context, 
+                max_length=max_length, 
+                stopping_criteria=stopping_criteria,
+                do_sample=False,
+            )
+        # Remove the context from the generations
+        return generations[0, context.shape[1] :]
 # for backwards compatibility
 GPT2LM = HFLM
--- a/lm_eval/models/t5.py
+++ b/lm_eval/models/t5.py
@@ -62,7 +62,7 @@ class T5LM(BaseLM):
    @property
    def max_gen_toks(self):
+        # Hard-coded value of 256; the previous implementation returned the
+        # tokenizer's model_max_length instead.
-        return self.tokenizer.model_max_length
+        return 256
    @property
    def batch_size(self):
@@ -186,11 +186,21 @@ class T5LM(BaseLM):
            EOSCriteria(self.tokenizer.eos_token)
        ])
-    def _model_generate(self, context, max_length, stopping_criteria_ids):
+    def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
+        """Generate from `context` without sampling (`do_sample=False`).
+        When `num_fewshot == 0` generation is bounded by `eos_token_id`; otherwise
+        the criteria built from `stopping_criteria_ids` are passed to `generate`.
+        Returns row 0 of the output; no prompt positions are sliced off here.
+        """
        stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
-        return self.t5.generate(
-            context, 
+        if num_fewshot == 0:
-            max_length=max_length, 
+            generations = self.t5.generate(
-            stopping_criteria=stopping_criteria,
+                context, 
-            do_sample=False,
+                max_length=max_length, 
-        )
+                eos_token_id=self.eot_token_id,
+                do_sample=False,
+            )
+        else:
+            generations = self.t5.generate(
+                context, 
+                max_length=max_length, 
+                stopping_criteria=stopping_criteria,
+                do_sample=False,
+            )
+        return generations[0]
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -59,7 +59,9 @@ from . import hans
 from . import gem_webnlg
 from . import gem_xsum
 from . import gem_mlsum
-# from . import e2e_nlg_cleaned
+from . import wino_bias
+from . import e2e_nlg_cleaned
+from . import gem_asset_turk
 ########################################
 # Translation tasks
@@ -113,11 +115,12 @@ TASK_REGISTRY = {
    "wsc": superglue.SGWinogradSchemaChallenge,
    # Order by benchmark/genre?
    "coqa": coqa.CoQA,
-    "GEM/web_nlg": gem_webnlg.WebNLG,
    "drop": drop.DROP,
    "lambada": lambada.LAMBADA,
    "lambada_cloze": lambada_cloze.LAMBADA_cloze,
+    **gem_webnlg.construct_tasks(),
    # multilingual lambada
+    **gem_asset_turk.construct_tasks(),
    **lambada_multilingual.construct_tasks(),
    "wikitext": wikitext.WikiText,
    # "cbt-cn": cbt.CBTCN, # disabled pending context length fix
@@ -128,7 +131,7 @@ TASK_REGISTRY = {
    # Science related
    "pubmedqa": pubmedqa.Pubmed_QA,
    "sciq": sciq.SciQ,
-    # "e2e_nlg_cleaned": e2e_nlg_cleaned.E2E_NLG_Cleaned,
+    "e2e_nlg_cleaned": e2e_nlg_cleaned.E2E_NLG_Cleaned,
    "qasper": qasper.QASPER,
    "qa4mre_2011": qa4mre.QA4MRE_2011,
    "qa4mre_2012": qa4mre.QA4MRE_2012,
@@ -312,10 +315,14 @@ TASK_REGISTRY = {
    "gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
    "gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
   #LAMA
    "lama-trex":lama.Trex,
    "bigscience-lama":bigsciencelama.BigScienceLAMA,
+    # WinoBias
+    "wino_bias_type1_pro": wino_bias.WinoBiasType1Pro,
+    "wino_bias_type1_anti": wino_bias.WinoBiasType1Anti,
+    "wino_bias_type2_pro": wino_bias.WinoBiasType2Pro,
+    "wino_bias_type2_anti": wino_bias.WinoBiasType2Anti,
 }

--- a/lm_eval/tasks/e2e_nlg_cleaned.py
+++ b/lm_eval/tasks/e2e_nlg_cleaned.py
+"""
+Semantic Noise Matters for Neural Natural Language Generation
+http://arxiv.org/abs/1911.03905
+A cleaned version of the dataset from the E2E NLG Challenge.
+The dataset contains MR with restaurant attributes and corresponding descriptions.
+Homepage: https://github.com/tuetschek/e2e-cleaning
+"""
+from lm_eval.base import PromptSourceTask, rf
+from lm_eval import metrics
+_CITATION = """
+@inproceedings{dusek-etal-2019-semantic,
+    title = "Semantic Noise Matters for Neural Natural Language Generation",
+    author = "Du{\v{s}}ek, Ond{\v{r}}ej  and
+      Howcroft, David M.  and
+      Rieser, Verena",
+    booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
+    year = "2019",
+    address = "Tokyo, Japan",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/W19-8652",
+    doi = "10.18653/v1/W19-8652",
+    pages = "421--426",
+}
+"""
+# Work in progress
+class E2E_NLG_Cleaned(PromptSourceTask):
+    VERSION = 0
+    DATASET_PATH = "e2e_nlg_cleaned"
+    DATASET_NAME = None
+    def has_training_docs(self):
+        return True
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return True
+    def training_docs(self):
+        if self.has_training_docs():
+            # We cache training documents in `self._training_docs` for faster
+            # few-shot processing. If the data is too large to fit in memory,
+            # return the training data as a generator instead of a list.
+            if self._training_docs is None:
+                self._training_docs = list(self.dataset["train"])
+            return self._training_docs
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+    def max_generation_length(self):
+        return 64
+    # def stopping_criteria(self):
+    #     return '\n\n'
+    def invalid_doc_for_prompt(self, doc) -> bool:
+        """The QA prompts are not applicable to all the examples, we want to filter these out."""
+        return self.prompt.name.endswith("_qa") or self.prompt.name == "family_friendly_yes_no"
+    def doc_to_text(self, doc) -> str:
+        # if the response is not defined in PS, the text will be a single-element list containing an empty string
+        text = self.prompt.apply(doc)[0]
+        return text
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        _requests = []
+        # NOTE: In the future, target will be a list of strings.
+        request_args = {
+            "stopping_criteria": self.stopping_criteria(),
+            "max_generation_length": self.max_generation_length(),
+        }
+        # Skip examples for which the templates are not applicable
+        if ctx != "":
+            cont_request = rf.greedy_until(ctx, request_args)
+            _requests.append(cont_request)
+        return _requests
--- a/lm_eval/tasks/gem_asset_turk.py
+++ b/lm_eval/tasks/gem_asset_turk.py
+"""
+ASSET: ASSET (Alva-Manchego et al., 2020) is a multi-reference dataset
+for the evaluation of sentence simplification in English. The dataset
+uses the same 2,359 sentences from TurkCorpus (Xu et al., 2016)
+and each sentence is associated with 10 crowdsourced simplifications.
+Unlike previous simplification datasets, which contain a single
+transformation (e.g., lexical paraphrasing in TurkCorpus or sentence
+splitting in HSplit), the simplifications in ASSET encompass a variety
+of rewriting transformations.
+https://aclanthology.org/2020.acl-main.424.pdf
+TurkCorpus: TURKCorpus is a multi-reference dataset for the evaluation of
+sentence simplification in English. The dataset consists of 2,359 sentences
+from the Parallel Wikipedia Simplification (PWKP) corpus. Each sentence is
+associated with 8 crowdsourced simplifications that focus on only lexical
+paraphrasing (no sentence splitting or deletion).
+https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf
+"""
+from lm_eval.base import PromptSourceTask
+_CITATION = """
+@article{DBLP:journals/corr/abs-2005-00481,
+  author    = {Fernando Alva{-}Manchego and
+               Louis Martin and
+               Antoine Bordes and
+               Carolina Scarton and
+               Beno{\^{\i}}t Sagot and
+               Lucia Specia},
+  title     = {{ASSET:} {A} Dataset for Tuning and Evaluation of Sentence Simplification
+               Models with Multiple Rewriting Transformations},
+  journal   = {CoRR},
+  volume    = {abs/2005.00481},
+  year      = {2020},
+  url       = {https://arxiv.org/abs/2005.00481},
+  eprinttype = {arXiv},
+  eprint    = {2005.00481},
+  timestamp = {Thu, 14 Oct 2021 16:38:25 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2005-00481.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}"""
+""""@article{Xu-EtAl:2016:TACL,
+ author = {Wei Xu and Courtney Napoles and Ellie Pavlick and Quanze Chen and Chris Callison-Burch},
+ title = {Optimizing Statistical Machine Translation for Text Simplification},
+ journal = {Transactions of the Association for Computational Linguistics},
+ volume = {4},
+ year = {2016},
+ url = {https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf},
+ pages = {401--415}
+ }"""
+class AssetTurk(PromptSourceTask):
+    VERSION = 0
+    DATASET_PATH = "GEM/wiki_auto_asset_turk"
+    DATASET_NAME = None
+    SPLIT = None
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return True
+    def training_docs(self):
+        if self.has_training_docs():
+            if self._training_docs is None:
+                self._training_docs = list(self.dataset["train"])
+            return self._training_docs
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+    def test_docs(self):
+        return self.dataset[str(self.SPLIT)]
+    def stopping_criteria(self):
+        return None
+    def max_generation_length(self):
+        return 200
+    # def higher_is_better(self):
+    #     return {"bleu": True, "rouge": True}
+# One task class per evaluation split; each subclass only overrides SPLIT,
+# which AssetTurk.test_docs() uses as the dataset key.
+class AssetTest(AssetTurk):
+    SPLIT = "test_asset"
+class TurkTest(AssetTurk):
+    SPLIT = "test_turk"
+class AssetTest1(AssetTurk):
+    SPLIT = "challenge_test_asset_backtranslation"
+class AssetTest2(AssetTurk):
+    SPLIT = "challenge_test_asset_bfp02"
+class AssetTest3(AssetTurk):
+    SPLIT = "challenge_test_asset_bfp05"
+class AssetTest4(AssetTurk):
+    SPLIT = "challenge_test_asset_nopunc"
+class TurkTest1(AssetTurk):
+    SPLIT = "challenge_test_turk_backtranslation"
+class TurkTest2(AssetTurk):
+    SPLIT = "challenge_test_turk_bfp02"
+class TurkTest3(AssetTurk):
+    SPLIT = "challenge_test_turk_bfp05"
+class TurkTest4(AssetTurk):
+    SPLIT = "challenge_test_turk_nopunc"
+# Classes that construct_tasks() turns into registry entries.
+ASSET_TURK_CLASSES = [
+    AssetTest,
+    TurkTest,
+    TurkTest1,
+    TurkTest2,
+    TurkTest3,
+    TurkTest4,
+    AssetTest1,
+    AssetTest2,
+    AssetTest3,
+    AssetTest4,
+]
+def construct_tasks():
+    tasks = {}
+    for asset_turk_class in ASSET_TURK_CLASSES:
+        tasks[f"GEM/wiki_auto_asset_turk_{asset_turk_class.SPLIT}"] = asset_turk_class
+    return tasks
--- a/lm_eval/tasks/gem_webnlg.py
+++ b/lm_eval/tasks/gem_webnlg.py
+"""
+The 2020 Bilingual, Bi-Directional WebNLG+ Shared Task:
+Overview and Evaluation Results (WebNLG+ 2020)
+https://aclanthology.org/2020.webnlg-1.7/
+WebNLG+ offers two challenges: (i) mapping sets of RDF triples
+to English or Russian text (generation) and (ii) converting
+English or Russian text to sets of RDF triples (semantic parsing).
+Compared to the eponymous WebNLG challenge, WebNLG+ provides an
+extended dataset that enable the training, evaluation, and
+comparison of microplanners and semantic parsers. In this paper,
+we present the results of the generation and semantic parsing
+task for both English and Russian and provide a brief
+description of the participating systems.
+"""
 from lm_eval.base import PromptSourceTask
+_CITATION = """
+@inproceedings{castro-ferreira-etal-2020-2020,
+    title = "The 2020 Bilingual, Bi-Directional {W}eb{NLG}+ Shared Task: Overview and Evaluation Results ({W}eb{NLG}+ 2020)",
+    author = "Castro Ferreira, Thiago  and
+      Gardent, Claire  and
+      Ilinykh, Nikolai  and
+      van der Lee, Chris  and
+      Mille, Simon  and
+      Moussallem, Diego  and
+      Shimorina, Anastasia",
+    booktitle = "Proceedings of the 3rd International Workshop on Natural Language Generation from the Semantic Web (WebNLG+)",
+    month = "12",
+    year = "2020",
+    address = "Dublin, Ireland (Virtual)",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2020.webnlg-1.7",
+    pages = "55--76",
+    abstract = "WebNLG+ offers two challenges: (i) mapping sets of RDF triples to English or Russian text (generation) and (ii) converting English or Russian text to sets of RDF triples (semantic parsing). Compared to the eponymous WebNLG challenge, WebNLG+ provides an extended dataset that enable the training, evaluation, and comparison of microplanners and semantic parsers. In this paper, we present the results of the generation and semantic parsing task for both English and Russian and provide a brief description of the participating systems.",
+}
+"""
 class WebNLG(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "GEM/web_nlg"
    DATASET_NAME = "en"
+    SPLIT = None
    def has_training_docs(self):
        return False
@@ -27,11 +65,71 @@ class WebNLG(PromptSourceTask):
    def test_docs(self):
        if self.has_test_docs():
-            return self.dataset["test"]
+            if self.SPLIT is not None:
+                return self.dataset[str(self.SPLIT)]
+            else:
+                return self.dataset["test"]
    def stopping_criteria(self):
-        return '*'
+        return None
    def max_generation_length(self):
        return 250
+    # def higher_is_better(self):
+    #     return {"bleu": True, "rouge": True}
+# Variants of WebNLG that only override DATASET_NAME and/or SPLIT.
+# Russian configuration; SPLIT is inherited as None.
+class WebNLGRu(WebNLG):
+    DATASET_NAME = "ru"
+## En Challenge Sets
+class WebNLGEn1(WebNLG):
+    SPLIT = "challenge_validation_sample"
+class WebNLGEn2(WebNLG):
+    SPLIT = "challenge_test_scramble"
+class WebNLGEn3(WebNLG):
+    SPLIT = "challenge_test_numbers"
+## Ru Challenge sets
+class WebNLGRu1(WebNLG):
+    DATASET_NAME = "ru"
+    SPLIT = "challenge_validation_sample"
+class WebNLGRu2(WebNLG):
+    DATASET_NAME = "ru"
+    SPLIT = "challenge_test_scramble"
+# Classes that construct_tasks() turns into registry entries.
+WEBNLG_CLASSES = [
+    WebNLG,
+    WebNLGRu,
+    WebNLGEn1,
+    WebNLGEn2,
+    WebNLGEn3,
+    WebNLGRu1,
+    WebNLGRu2,
+]
+def construct_tasks():
+    tasks = {}
+    for webnlg_class in WEBNLG_CLASSES:
+        if webnlg_class.SPLIT is None:
+            tasks[f"GEM/web_nlg_{webnlg_class.DATASET_NAME}"] = webnlg_class
+        else:
+            tasks[
+                f"GEM/web_nlg_{webnlg_class.DATASET_NAME}_{webnlg_class.SPLIT}"
+            ] = webnlg_class
+    return tasks
--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -305,3 +305,39 @@ class SGWinogradSchemaChallenge(PromptSourceTask):
    def aggregation(self):
        return {"acc": mean}
+class WinogenderSchemaDiagnostics(PromptSourceTask):
+    """PromptSource task for the `axg` configuration of `super_glue`; only test docs are exposed."""
+    VERSION = 0
+    DATASET_PATH = "super_glue"
+    DATASET_NAME = "axg"
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return False 
+    def has_test_docs(self):
+        return True 
+    def test_docs(self):
+        return self.dataset["test"]
+class BroadcoverageDiagnostics(PromptSourceTask):
+    """PromptSource task for the `axb` configuration of `super_glue`; only test docs are exposed."""
+    VERSION = 0
+    DATASET_PATH = "super_glue"
+    DATASET_NAME = "axb"
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return False 
+    def has_test_docs(self):
+        return True 
+    def test_docs(self):
+        return self.dataset["test"]
--- a/lm_eval/tasks/wino_bias.py
+++ b/lm_eval/tasks/wino_bias.py
+"""
+Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods
+https://arxiv.org/abs/1804.06876
+Winograd-schema evaluation of gendered coreference resolution.
+The dataset contains pro-stereotypical and anti-stereotypical parts. The difference in accuracy for those two subsets
+quantifies bias.
+Homepage: https://uclanlp.github.io/corefBias/overview
+"""
+from lm_eval.base import PromptSourceTask, mean
+import transformers.data.metrics.squad_metrics as squad_metrics
+_CITATION = """
+@inproceedings{zhao-etal-2018-gender,
+    title = "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods",
+    author = "Zhao, Jieyu  and
+      Wang, Tianlu  and
+      Yatskar, Mark  and
+      Ordonez, Vicente  and
+      Chang, Kai-Wei",
+    booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
+    month = jun,
+    year = "2018",
+    address = "New Orleans, Louisiana",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/N18-2003",
+    doi = "10.18653/v1/N18-2003",
+    pages = "15--20",
+    abstract = "In this paper, we introduce a new benchmark for co-reference resolution focused on gender bias, WinoBias. Our corpus contains Winograd-schema style sentences with entities corresponding to people referred by their occupation (e.g. the nurse, the doctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a neural coreference system all link gendered pronouns to pro-stereotypical entities with higher accuracy than anti-stereotypical entities, by an average difference of 21.1 in F1 score. Finally, we demonstrate a data-augmentation approach that, in combination with existing word-embedding debiasing techniques, removes the bias demonstrated by these systems in WinoBias without significantly affecting their performance on existing datasets.",
+}
+"""
+class WinoBias(PromptSourceTask):
+    VERSION = 0
+    DATASET_PATH = "wino_bias"
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return True
+    def training_docs(self):
+        pass
+    def validation_docs(self):
+        return self.dataset["validation"]
+    def test_docs(self):
+        return self.dataset["test"]
+    def stopping_criteria(self):
+        return "\n"
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        target = self.doc_to_target(doc).strip()
+        pred = " ".join(results[0].strip().split(" ")[:len(target.split(" "))])
+        # The original paper uses F1. In the case of exactly one predicted and one gold mention,
+        # F1 and exact match are equivalent.
+        em = squad_metrics.compute_exact(target, pred)
+        out = {"em": em}
+        if self.save_examples:
+            example = {"target": target, "pred": pred}
+            return out, example
+        return out
+    def aggregation(self):
+        """
+        :returns: {str: [metric_score] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metric scores
+        """
+        return {'em': mean}
+    def higher_is_better(self):
+        return {'em': True}
+# One task class per dataset configuration; each only overrides DATASET_NAME.
+class WinoBiasType1Pro(WinoBias):
+    DATASET_NAME = "type1_pro"
+class WinoBiasType1Anti(WinoBias):
+    DATASET_NAME = "type1_anti"
+class WinoBiasType2Pro(WinoBias):
+    DATASET_NAME = "type2_pro"
+class WinoBiasType2Anti(WinoBias):
+    DATASET_NAME = "type2_anti"
--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -11,14 +11,14 @@ EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
 def parse_args():
+    """Define this script's command-line options and return the parsed namespace."""
    parser = argparse.ArgumentParser()
-    parser.add_argument('--output_base_path', required=True)
+    parser.add_argument("--output_base_path", required=True)
-    parser.add_argument('--tasks', default="all_tasks")
+    parser.add_argument("--tasks", default="all_tasks")
-    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument("--provide_description", action="store_true")
-    parser.add_argument('--sets', type=str, default="val") # example: val,test
+    parser.add_argument("--sets", type=str, default="val")  # example: val,test
-    parser.add_argument('--num_fewshot', type=int, default=1)
+    parser.add_argument("--num_fewshot", type=int, default=1)
-    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument('--num_examples', type=int, default=1)
+    parser.add_argument("--num_examples", type=int, default=1)
-    parser.add_argument('--description_dict_path', default=None)
+    parser.add_argument("--description_dict_path", default=None)
    return parser.parse_args()
@@ -34,7 +34,7 @@ def main():
    description_dict = {}
    if args.description_dict_path:
-        with open(args.description_dict_path, 'r') as f:
+        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)
    os.makedirs(args.output_base_path, exist_ok=True)
@@ -45,26 +45,34 @@ def main():
        iters = []
        for set in args.sets.split(","):
-            if set == 'train' and task.has_training_docs():
+            if set == "train" and task.has_training_docs():
                docs = task.training_docs()
-            if set == 'val' and task.has_validation_docs():
+            if set == "val" and task.has_validation_docs():
                docs = task.validation_docs()
-            if set == 'test' and task.has_test_docs():
+            if set == "test" and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)
        docs = join_iters(iters)
-        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+        description = (
-        task_name = task_name.replace('/','_')
+            description_dict[task_name]
+            if description_dict and task_name in description_dict
+            else ""
+        )
+        task_name = task_name.replace("/", "_")
        with open(os.path.join(args.output_base_path, task_name), "w") as f:
-            for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
+            for i, doc in (
+                zip(range(args.num_examples), docs)
+                if args.num_examples > 0
+                else enumerate(docs)
+            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx, _ = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
-                    description=description
+                    description=description,
                )
                f.write(ctx + "\n")

--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ setuptools.setup(
        "click>=7.1",
        "scikit-learn>=0.24.1",
        "torch>=1.7",
-        "transformers>=4.1",
+        "transformers>=4.16",
        "sqlitedict==1.6.0",
        "pytablewriter==0.58.0",
        "sacrebleu==1.5.0",