add dataset description

58abbac2 · KhalidAlt · 383318fe · 383318fe · 58abbac2
Commit 58abbac2 authored Apr 28, 2022 by KhalidAlt
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 118 deletions

lm_eval/tasks/TyDiQA.py lm_eval/tasks/TyDiQA.py +0 -110

lm_eval/tasks/lama.py lm_eval/tasks/lama.py +7 -8

No files found.
--- a/lm_eval/tasks/TyDiQA.py
+++ b/lm_eval/tasks/TyDiQA.py
-"""
-Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference
-https://arxiv.org/abs/1902.01007
-
-A controlled evaluation set called HANS (Heuristic Analysis for NLI Systems),
-which contains many examples where the heuristics fail.
-
-Homepage: https://github.com/tommccoy1/hans
-"""
-from lm_eval.base import PromptSourceTask
-
-
-_CITATION = """\
-@article{tydiqa,
-title   = {TyDi QA: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages},
-author  = {Jonathan H. Clark and Eunsol Choi and Michael Collins and Dan Garrette and Tom Kwiatkowski and Vitaly Nikolaev and Jennimaria Palomaki}
-year    = {2020},
-journal = {Transactions of the Association for Computational Linguistics}
-}
-"""
-
-
-class Primary(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "tydiqa"
-    DATASET_NAME = "primary_task"
-
-    def has_training_docs(self):
-        return True
-
-    def has_validation_docs(self):
-        return True
-
-    def has_test_docs(self):
-        return False
-
-    def training_docs(self):
-        if self.has_training_docs():
-            # We cache training documents in `self._training_docs` for faster
-            # few-shot processing. If the data is too large to fit in memory,
-            # return the training data as a generator instead of a list.
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["test"]
-    def process_results(self, doc, results):
-        out = {}
-        #gold = doc
-        pred = results[0].strip()
-        print("############")
-        print(self.doc_to_target(doc))
-
-        target = self.doc_to_target(doc)['sub_label']
-        #pred = np.argmax(results)
-        out["acc"] = pred == target
-
-
-        #result = metric.compute(predictions=pred, references=gold)
-        #out['acc'] = {"accuracy": result["score"]}
-        
-        #out['acc'] = 1.0 if pred == gold else 0.0
-        if self.save_examples:
-            example = {
-                "pred": pred,
-                "target": target,
-            }
-            return out, example
-
-        return out
-
-
-class Secondary(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "tydiqa"
-    DATASET_NAME = "secondary_task"
-
-    def has_training_docs(self):
-        return True
-
-    def has_validation_docs(self):
-        return True
-
-    def has_test_docs(self):
-        return False
-
-    def training_docs(self):
-        if self.has_training_docs():
-            # We cache training documents in `self._training_docs` for faster
-            # few-shot processing. If the data is too large to fit in memory,
-            # return the training data as a generator instead of a list.
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["test"]
-
-
--- a/lm_eval/tasks/lama.py
+++ b/lm_eval/tasks/lama.py
 """
-Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference
-https://arxiv.org/abs/1902.01007
+https://arxiv.org/abs/1909.01066
+https://arxiv.org/abs/2005.04611
+LAMA is a probe dataset for testing the factual and commonsense knowledge in language models. The dataset includes a subset of
+Google_RE (https://code.google.com/archive/p/relation-extraction-corpus/), TRex (a subset of Wikidata triples),
+ConceptNet (https://github.com/commonsense/conceptnet5/wiki) and SQuAD.

-A controlled evaluation set called HANS (Heuristic Analysis for NLI Systems),
-which contains many examples where the heuristics fail.
-
-Homepage: https://github.com/tommccoy1/hans
+Homepage: https://github.com/facebookresearch/LAMA
 """
 from lm_eval.base import PromptSourceTask
 import numpy as np 
 from lm_eval.metrics import mean
-from lm_eval import metrics,utils
-from typing import Iterable, Optional
+from typing import Optional

 _CITATION = """
 @inproceedings{petroni2019language, title={Language Models as Knowledge Bases?},