Unverified commit 4a294d8a authored by Charles Foster, committed by GitHub
Browse files

Merge pull request #3 from EleutherAI/master

Sync up to EAI
parents 302ca3d6 946cb2bc
......@@ -4,16 +4,19 @@ from . import arc
from . import race
from . import webqs
from . import anli
from . import wsc273
from . import winogrande
from . import quac
from . import hellaswag
from . import openbookqa
from . import squad
from . import naturalqs
TASK_REGISTRY = {
# GLUE
"cola": glue.CoLA,
"mnli": glue.MNLI,
"mnli_mismatched": glue.MNLIMismatched,
"mrpc": glue.MRPC,
"rte": glue.RTE,
"qnli": glue.QNLI,
......@@ -27,7 +30,7 @@ TASK_REGISTRY = {
"copa": superglue.Copa,
"multirc": superglue.MultiRC,
"wic": superglue.WordsInContext,
"wsc": superglue.WinogradSchemaChallenge,
"wsc": superglue.SGWinogradSchemaChallenge,
# Order by benchmark/genre?
"arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge,
......@@ -36,7 +39,9 @@ TASK_REGISTRY = {
"openbookqa": openbookqa.OpenBookQA,
"squad": squad.SQuAD,
"race": race.RACE,
"naturalqs": naturalqs.NaturalQs,
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande,
"anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2,
......
......@@ -5,8 +5,10 @@ from ..utils import sh
class CoQA(Dataset):
def __init__(self):
self.download()
def download(self):
#TODO: don't download if files already there
sh("""
mkdir -p data/coqa
wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json -O data/coqa/coqa-train-v1.0.json
......@@ -17,48 +19,34 @@ class CoQA(Dataset):
return True
def has_validation_docs(self):
return False
return True
def has_test_docs(self):
return False
def training_docs(self):
myjson = json.load(open('data/coqa/coqa-train-v1.0.json'))['data']
return self.load_doc(myjson)
return json.load(open('data/coqa/coqa-train-v1.0.json'))['data']
def validation_docs(self):
pass
return json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
def test_docs(self):
myjson = json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
return self.load_doc(myjson)
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.seed(123)
random.shuffle(traindocs)
return traindocs[:k]
pass
def fewshot_description(self):
pass
def load_doc(self, myjson):
docs = []
for item in myjson:
new_instance = [item['story']]
qa_pairs = zip(item['questions'], item['answers'])
for pair in qa_pairs:
new_instance.append('\n')
new_instance.append(''.join(['Q: ',pair[0]['input_text']]))
new_instance.append(''.join(['A: ',pair[1]['input_text']]))
docs.append(new_instance)
return docs
def doc_to_text(self, doc, include_target=True):
text = '\n<|endoftext|>\n'.join(['\n'.join(instance) for instance in doc])
text = text + '\n<|endoftext|>'
return text
text = [doc['story']]
for pair in zip(doc['questions'], doc['answers']):
text.append('\n\n')
text.append(''.join(['Q: ',pair[0]['input_text'], '\n\n']))
if include_target:
text.append(''.join(['A: ',pair[1]['input_text']]))
else:
text.append('A: ')
return ''.join(text)
def evaluate(self, docs, lm):
pass
......@@ -10,6 +10,9 @@ from ..base import Dataset
class DROP(Dataset):
DATAFOLDER = Path(__file__).parent / "../../data/drop"
def __init__(self):
self.download()
def has_training_docs(self):
"""Whether the task has a training set"""
return True
......@@ -35,10 +38,10 @@ class DROP(Dataset):
pass
def doc_to_text(self, doc, include_target=True):
doctext = "Passage: {}\n\n".format(doc["passage"])
doctext = "Passage: {}\n".format(doc["passage"])
qa_texts = []
for pair in doc["qa_pairs"]:
text = ''.join(['Q: ', pair['question'],'\nA: '])
text = ''.join(['Question: ', pair['question'],'\nAnswer: '])
if include_target:
def get_answer(ans_dict):
if ans_dict['number'] != '':
......@@ -52,7 +55,7 @@ class DROP(Dataset):
ans_dict['date']['year']]).strip()
text = ''.join([text, get_answer(pair['answer'])])
qa_texts.append(text)
return ''.join([doctext, '\n\n'.join(qa_texts)])
return ''.join([doctext, '\n'.join(qa_texts)])
def evaluate(self, docs, lm, provide_description, num_fewshot):
......
......@@ -114,6 +114,17 @@ class MNLI(HFTask):
return simple_accuracy_metric(preds=preds, golds=golds)
class MNLIMismatched(MNLI):
    """MNLI with mismatched-genre splits; inherits formatting/eval from MNLI."""

    def validation_docs(self):
        if not self.has_validation_docs():
            return None
        return self.data["validation_mismatched"]

    def test_docs(self):
        if not self.has_test_docs():
            return None
        return self.data["test_mismatched"]
class MRPC(HFTask):
DATASET_PATH = "glue"
DATASET_NAME = "mrpc"
......
......@@ -6,7 +6,8 @@ import ftfy
class Lambada(Dataset):
def __init__(self):
self.download()
def download(self):
sh("mkdir -p data/lambada")
with open("data/lambada/lambada_test.json", 'w') as f:
......@@ -32,14 +33,16 @@ class Lambada(Dataset):
pass
def load_doc(self, myjson):
return [doc['text'] for doc in myjson]
return [doc for doc in myjson]
def test_docs(self):
myjson = json.load(open("data/lambada/lambada_test.json"))
return self.load_doc(myjson)
def doc_to_text(self, doc, include_target=True):
pass
#TODO: check if this is how OA does it
#label = doc[]
return doc
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
from . common import HFTask
class NaturalQs(HFTask):
    """Natural Questions loaded via HuggingFace datasets.

    Prompts are rendered as "Q: <question>\n\nA: <long answer>", using the
    long answer (per the GPT-3 paper) with HTML tokens stripped.
    """
    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def training_docs(self):
        # Data is too large to fit in memory, so hand back the lazy HF split
        # rather than materializing it.
        return self.data["train"]

    def doc_to_text(self, doc, include_target=True):
        """Render one NQ example as a Q/A prompt.

        The long-answer span is a pair of token indices into the document;
        tokens flagged as HTML are dropped before joining.
        """
        question = doc['question']['text']
        text = 'Q: ' + question + '\n\n' + 'A: '
        if include_target:
            # There's a short answer and a long answer; based on the paper we
            # use the long answer. NOTE: do not eagerly index the short answer
            # here (doc['annotations']['short_answers'][0]['text']) -- examples
            # without a short answer would raise IndexError even though the
            # value was never used.
            long_answer_start = doc['annotations']['long_answer'][0]['start_token']
            long_answer_end = doc['annotations']['long_answer'][0]['end_token']
            long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
            long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
            long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
            long_answer = " ".join(long_answer_chars)
            text += long_answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
import json
import random
from lm_eval.base import Dataset
from ..utils import sh
class PiQA(Dataset):
    """PIQA: physical-commonsense binary-choice QA (Bisk et al., 2020)."""

    def __init__(self):
        self.download()

    def download(self):
        # TODO: don't download if files already there
        sh("""
        mkdir -p data/piqa
        wget https://yonatanbisk.com/piqa/data/train.jsonl -O data/piqa/piqa-train.jsonl
        wget https://yonatanbisk.com/piqa/data/train-labels.lst -O data/piqa/piqa-train-labels.lst
        wget https://yonatanbisk.com/piqa/data/valid.jsonl -O data/piqa/piqa-valid.jsonl
        wget https://yonatanbisk.com/piqa/data/valid-labels.lst -O data/piqa/piqa-valid-labels.lst
        wget https://yonatanbisk.com/piqa/data/tests.jsonl -O data/piqa/piqa-test.jsonl
        """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def load_docs(self, textfilename, labelfilename):
        """Load JSONL examples, paired with label lines when labels exist.

        Returns (example_dict, label_line) pairs when labelfilename is given,
        otherwise bare example dicts (the test split has no labels).
        Files are closed explicitly (previously leaked via bare open()).
        """
        with open(textfilename, 'r') as tf:
            examples = [json.loads(line) for line in tf]
        if labelfilename is None:
            return examples
        with open(labelfilename, 'r') as lf:
            labels = list(lf)
        return zip(examples, labels)

    def training_docs(self):
        return self.load_docs('data/piqa/piqa-train.jsonl', 'data/piqa/piqa-train-labels.lst')

    def validation_docs(self):
        return self.load_docs('data/piqa/piqa-valid.jsonl', 'data/piqa/piqa-valid-labels.lst')

    def test_docs(self):
        return self.load_docs('data/piqa/piqa-test.jsonl', None)

    def fewshot_description(self):
        pass

    def doc_to_text(self, doc, include_target=True):
        if include_target:
            # doc is an (example, label) pair; label '0'/'1' selects sol1/sol2.
            rightanswer = int(doc[1][0]) + 1
            return ''.join([doc[0]['goal'], ' ', doc[0]['sol' + str(rightanswer)]])
        # TODO: check if oa uses newline
        # NOTE(review): this branch indexes doc as a bare dict, i.e. an
        # unlabeled (test) example rather than the pair above -- confirm callers.
        return doc['goal'] + ' '

    def evaluate(self, docs, lm):
        pass
import json
import random
from lm_eval.base import Dataset
from ..utils import sh
import csv
class StoryCloze(Dataset):
    """Story Cloze Test: choose the correct fifth sentence for a 4-sentence story."""

    def __init__(self):
        self.download()

    def download(self):
        # TODO: replace with Eye link
        pass

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        pass

    def load_doc(self, filename):
        # Each CSV row: [id, sent1..sent4, ending1, ending2, answer].
        with open(filename, newline='') as fh:
            return [row for row in csv.reader(fh)]

    def validation_docs(self):
        return self.load_doc("data/storycloze/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")

    def test_docs(self):
        return self.load_doc("data/storycloze/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv")

    def fewshot_description(self):
        pass

    def doc_to_text(self, doc, include_target=True):
        story = ' '.join(doc[1:5])
        if not include_target:
            return story
        # Answer column is '1' or '2'; int(ans) - 4 indexes the matching
        # ending column from the end of the row.
        ending = doc[int(doc[-1]) - 4]
        return story + ' ' + ending

    def evaluate(self, docs, lm):
        pass
......@@ -218,7 +218,7 @@ class WordsInContext(HFTask):
return simple_accuracy_metric(preds=preds, golds=golds)
class WinogradSchemaChallenge(HFTask):
class SGWinogradSchemaChallenge(HFTask):
DATASET_PATH = "super_glue"
DATASET_NAME = "wsc"
......@@ -282,3 +282,25 @@ class WinogradSchemaChallenge(HFTask):
)
preds.append(1 if generated == to_predict else 0)
return simple_accuracy_metric(preds=preds, golds=golds)
class RTE(HFTask):
    """SuperGLUE RTE, rendered as a True/False question prompt."""
    DATASET_PATH = "super_glue"
    DATASET_NAME = "rte"

    def fewshot_description(self):
        # TODO: implement
        pass

    def doc_to_text(self, doc, include_target=True):
        # Build the shared prompt once; the target branch only appends the answer.
        prompt = ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: '])
        if include_target:
            # Label 0 is entailment -> "True"; any other label -> "False".
            prompt += 'True' if doc['label'] == 0 else 'False'
        return prompt

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO:
        pass
import json
import random
from lm_eval.base import Dataset
from ..utils import sh
class TriviaQA(Dataset):
    """TriviaQA (unfiltered) open-domain question answering."""

    def __init__(self):
        self.download()

    def download(self):
        # TODO: don't download if files already there
        sh("""
        mkdir -p data/triviaqa
        wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
        tar -xf data/triviaqa/trivia_qa-unfiltered.tar.gz
        mv triviaqa-unfiltered/ data/triviaqa/
        """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _load_split(self, path):
        # Close the file explicitly; previously json.load(open(...)) leaked
        # the handle.
        with open(path) as fh:
            return json.load(fh)['Data']

    def training_docs(self):
        return self._load_split('data/triviaqa/triviaqa-unfiltered/unfiltered-web-train.json')

    def validation_docs(self):
        return self._load_split('data/triviaqa/triviaqa-unfiltered/unfiltered-web-dev.json')

    def test_docs(self):
        return self._load_split('data/triviaqa/triviaqa-unfiltered/unfiltered-web-test.json')

    def fewshot_description(self):
        pass

    def doc_to_text(self, doc, include_target=True):
        prefix = ''.join(['Q: ', doc['Question'], '\n\n', 'A: '])
        if include_target:
            # Use the first listed alias as the canonical answer string.
            return prefix + doc['Answer']['Aliases'][0]
        return prefix

    def evaluate(self, docs, lm):
        pass
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
class WikiText103(NLP_TASK):
    """WikiText-103 (raw) language-modeling corpus."""
    NLP_PATH = "wikitext"
    NLP_NAME = "wikitext-103-raw-v1"

    def fewshot_description(self):
        return ""

    def doc_to_text(self, doc, include_target=True):
        # Perplexity-style task: the document text itself is the target.
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
class WikiText2(NLP_TASK):
    """WikiText-2 (raw) language-modeling corpus."""
    NLP_PATH = "wikitext"
    NLP_NAME = "wikitext-2-raw-v1"

    def fewshot_description(self):
        return ""

    def doc_to_text(self, doc, include_target=True):
        # Perplexity-style task: the document text itself is the target.
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
\ No newline at end of file
import json
import random
import os
from lm_eval.base import Dataset
from ..utils import sh
class WinogradSchemaChallenge273(Dataset):
    """WSC273: 273 Winograd schema sentence pairs (correct/incorrect completion)."""

    def __init__(self):
        super().__init__()

    def download(self):
        if not os.path.exists('data/wsc273'):
            sh("""
            mkdir -p data/wsc273
            wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
            """)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def training_docs(self):
        return []

    def validation_docs(self):
        return []

    def test_docs(self):
        # Close the file explicitly (previously leaked via bare open()).
        with open('data/wsc273/wsc273.json') as fh:
            myjson = json.load(fh)
        return self.load_doc(myjson)

    def fewshot_description(self):
        # This format is ONLY for the purposes of deduplication. For the task evaluation, we'll need to find a new strategy,
        # to meet the needs of this particular task.
        return "Winograd schema sentence with correct continuation. True. Winogard schema sentence with incorrect continuation. False." if False else "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

    def load_doc(self, myjson):
        """Pair consecutive entries into {'id', 'completions': {'T', 'F'}} docs.

        Raises ValueError when a pair's question ids disagree, or when neither
        entry of a pair is marked correct (the old code silently re-appended
        the previous doc in that case -- or crashed on the first pair).
        """
        docs = []
        for i in range(0, 273 * 2, 2):
            item1 = myjson[i]
            item2 = myjson[i + 1]
            if item1['question_id'] != item2['question_id']:
                raise ValueError("WSC273 has missing completion pair.")
            # Preserve old precedence: if both were flagged correct, item2 won.
            if item2['correctness']:
                correct, incorrect = item2, item1
            elif item1['correctness']:
                correct, incorrect = item1, item2
            else:
                raise ValueError("WSC273 pair has no correct completion.")
            docs.append({
                'id': item1['question_id'],
                'completions': {
                    'T': correct['substitution'],
                    'F': incorrect['substitution'],
                },
            })
        return docs

    def doc_to_text(self, doc, include_target=True):
        # WSC273 is currently only writing out full examples. Partial evaluation needs implementing.
        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
        return text

    def evaluate(self, docs, lm):
        # TODO: Write evaluation function
        raise NotImplementedError()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment