add drop data + doc2text

ec4d3615 · Anish Thite · 5888a695 · ec4d3615 · ec4d3615
Commit ec4d3615 authored Oct 04, 2020 by Anish Thite
Hide whitespace changes
Inline Side-by-side

Showing with 84 additions and 1 deletion

download_all.sh download_all.sh +9 -1

lm_eval/tasks/drop.py lm_eval/tasks/drop.py +75 -0

No files found.
--- a/download_all.sh
+++ b/download_all.sh
@@ -3,4 +3,12 @@
 #coqa
 mkdir -p data/coqa
 wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json -O data/coqa/coqa-train-v1.0.json
-wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json -O data/coqa/coqa-dev-v1.0.json
\ No newline at end of file
+wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json -O data/coqa/coqa-dev-v1.0.json
+
+#drop
+mkdir -p data/drop  # -p: no error if the folder already exists
+wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip -O data/drop.zip  # save the archive as data/drop.zip
+unzip data/drop.zip -d data/drop  # extract the archive under data/drop
+rm data/drop.zip  # the archive is not needed once extracted
+mv data/drop/drop_dataset/* data/drop  # hoist the files out of the nested drop_dataset/ folder
+rm -rf data/drop/drop_dataset  # remove the leftover nested folder
--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
+import numpy as np
+import json
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import f1_score, matthews_corrcoef
+from tqdm import auto as tqdm_lib
+from . common import NLP_TASK, simple_accuracy_metric, yesno
+from pathlib import Path
+from ..base import Dataset
+
+class DROP(Dataset):
+    """DROP question-answering task read from the JSON files in data/drop.
+
+    Each doc is one passage dict holding a "passage" string and a "qa_pairs"
+    list, which is the shape `doc_to_text` consumes.
+    """
+
+    # Folder expected to contain drop_dataset_train.json and
+    # drop_dataset_dev.json.
+    DATAFOLDER = Path(__file__).parent / "../../data/drop"
+
+    def has_training_docs(self):
+        """Whether the task has a training set"""
+        return True
+
+    def has_validation_docs(self):
+        """Whether the task has a validation set"""
+        return True
+
+    def has_test_docs(self):
+        """Whether the task has a test set"""
+        return False
+
+    def _load_docs(self, filename):
+        """Return the values of the JSON object in DATAFOLDER/filename as a list."""
+        # `with` closes the handle promptly instead of leaking it; the
+        # encoding is pinned because JSON text is UTF-8 regardless of locale.
+        with open(self.DATAFOLDER / filename, encoding="utf-8") as f:
+            return list(json.load(f).values())
+
+    def training_docs(self):
+        """Return the training passages as a list of dicts."""
+        return self._load_docs('drop_dataset_train.json')
+
+    def validation_docs(self):
+        """Return the validation (dev) passages as a list of dicts."""
+        return self._load_docs('drop_dataset_dev.json')
+
+    def test_docs(self):
+        """No test split is loaded (has_test_docs is False); returns None."""
+        return None
+
+    @staticmethod
+    def _answer_text(ans_dict):
+        """Render an answer dict as text.
+
+        Precedence: a non-empty "number", then the "spans" joined with
+        ", ", then the "date" fields in day / month / year order.
+        """
+        if ans_dict['number'] != '':
+            # str() keeps the later string concatenation safe for numeric values.
+            return str(ans_dict['number'])
+        if ans_dict['spans']:
+            return ', '.join(ans_dict['spans'])
+        date = ans_dict['date']
+        # Skip blank fields so a partial date carries no doubled or stray spaces.
+        parts = [date['day'], date['month'], date['year']]
+        return ' '.join(part for part in parts if part)
+
+    def doc_to_text(self, doc, include_target=True):
+        """Format one passage and its questions as a prompt string.
+
+        :param doc: dict with "passage" and "qa_pairs"; every pair carries a
+            "question" and an "answer" dict.
+        :param include_target: when True, append each answer after "A: ".
+        :return: "Passage: ..." followed by the Q/A blocks, separated by
+            blank lines.
+        """
+        qa_texts = []
+        for pair in doc["qa_pairs"]:
+            text = 'Q: ' + pair['question'] + '\nA: '
+            if include_target:
+                text += self._answer_text(pair['answer'])
+            qa_texts.append(text)
+        return "Passage: {}\n\n".format(doc["passage"]) + '\n\n'.join(qa_texts)
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        """Take iterable of docs and evaluates, returning a dict with the following format:
+
+        {
+            "major": float,
+            "minor": dict,
+            "higher_is_better": bool,
+        }
+
+        * `major` should be a single, representative number, for programmatic comparison
+        * `minor` should be a dictionary containing all relevant sub-metrics
+        * `higher_is_better` determines whether a higher metric is better
+
+        Not implemented yet: the body is a stub and currently returns None.
+        """
+        pass
+
+    def fewshot_description(self):
+        """Return the task description string."""
+        return "Read the passage and answer the questions "
+