# drop.py

import numpy as np
import json
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFNLPTask, simple_accuracy_metric, yesno
from pathlib import Path
from ..base import Dataset

class DROP(Dataset):
    """Task wrapper for the DROP dataset JSON files stored under ``DATAFOLDER``.

    Each document is one entry of the dataset JSON: a dict that carries a
    ``"passage"`` string and a ``"qa_pairs"`` list of question/answer dicts.
    """

    DATAFOLDER = Path(__file__).parent / "../../data/drop"

    def has_training_docs(self):
        """Whether the task has a training set"""
        return True

    def has_validation_docs(self):
        """Whether the task has a validation set"""
        return True

    def has_test_docs(self):
        """Whether the task has a test set"""
        return False

    def _load_split(self, filename):
        """Read one dataset JSON file from ``DATAFOLDER`` and return its values as a list."""
        # The context manager closes the handle deterministically; JSON text
        # is UTF-8, so do not rely on the platform's default encoding.
        with open(self.DATAFOLDER / filename, encoding="utf-8") as handle:
            docs = json.load(handle)
        return list(docs.values())

    def training_docs(self):
        """Return the list of documents in ``drop_dataset_train.json``."""
        return self._load_split('drop_dataset_train.json')

    def validation_docs(self):
        """Return the list of documents in ``drop_dataset_dev.json``."""
        return self._load_split('drop_dataset_dev.json')

    def test_docs(self):
        """No test split is available (see ``has_test_docs``); returns None."""
        pass

    @staticmethod
    def _answer_text(ans_dict):
        """Render one answer dict as text.

        Precedence: a non-empty ``number`` wins, then ``spans`` (joined with
        ``", "``), and otherwise the non-empty ``date`` fields in
        day / month / year order separated by single spaces.
        """
        if ans_dict['number'] != '':
            return ans_dict['number']
        if ans_dict['spans']:
            # Joining also covers the single-span case (no separator emitted).
            return ', '.join(ans_dict['spans'])
        date = ans_dict['date']
        # Skip empty fields so a missing middle field cannot leave a double space.
        fields = (date['day'], date['month'], date['year'])
        return ' '.join(field for field in fields if field).strip()

    def doc_to_text(self, doc, include_target=True):
        """Format a document as a passage followed by its Q/A pairs.

        :param doc: dict with a ``"passage"`` string and a ``"qa_pairs"`` list;
            each pair has a ``"question"`` and an ``"answer"`` dict.
        :param include_target: when True, the answer text is appended after
            each ``"A: "`` marker; when False the marker is left open.
        :return: ``"Passage: ...\\n\\n"`` followed by the Q/A entries, which
            are separated from each other by a blank line.
        """
        qa_texts = []
        for pair in doc["qa_pairs"]:
            text = ''.join(['Q: ', pair['question'], '\nA: '])
            if include_target:
                text = ''.join([text, self._answer_text(pair['answer'])])
            qa_texts.append(text)
        doctext = "Passage: {}\n\n".format(doc["passage"])
        return ''.join([doctext, '\n\n'.join(qa_texts)])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Take iterable of docs and evaluates, returning a dict with the following format:

        {
            "major": float,
            "minor": dict,
            "higher_is_better": bool,
        }

        * `major` should be a single, representative number, for programmatic comparison
        * `minor` should be a dictionary containing all relevant sub-metrics
        * `higher_is_better` determines whether a higher metric is better

        NOTE: not implemented yet -- this currently returns None.
        """
        pass

    def fewshot_description(self):
        """Return the instruction text describing the task."""
        return "Read the passage and answer the questions "