# naturalqs.py

import random
from . common import HFTask
from itertools import islice


class NaturalQs(HFTask):
    """Natural Questions task built on the ``natural_questions`` dataset.

    Prompts are rendered as ``Q: <question>\\n\\nA:`` and the target is the
    long answer with HTML tokens stripped. Evaluation (requests, scoring,
    aggregation) is not implemented yet; those methods raise
    ``NotImplementedError``.
    """

    VERSION = 0
    # TODO: naturalqs has a *really* large train set that huggingface just
    # automatically downloads even if you don't use it. We should try to only
    # download the val set and not even bother with the train set.

    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    # Number of leading training documents materialized as the few-shot pool.
    _FEWSHOT_POOL_SIZE = 100000

    def has_training_docs(self):
        """Return True: a training split is exposed via ``training_docs``."""
        return True

    def has_validation_docs(self):
        """Return True: this task declares a validation split."""
        return True

    def has_test_docs(self):
        """Return False: this task declares no test split."""
        return False

    def fewshot_description(self):
        """Return the task description placed before few-shot examples.

        Currently empty.
        """
        # TODO: figure out description
        return ""

    def training_docs(self):
        """Return the ``"train"`` split of ``self.data``.

        The split is deliberately NOT materialized or cached here: the data
        is too large to fit in memory. ``fewshot_examples`` caches a bounded
        prefix instead.
        """
        # NOTE(review): ``self.data`` is presumably populated by HFTask -- confirm.
        return self.data["train"]

    def fewshot_examples(self, k, rnd):
        """Sample ``k`` few-shot documents from the start of the training set.

        :param k: int
            Number of examples to draw.
        :param rnd:
            Object providing ``sample(population, k)`` (e.g. ``random.Random``).
        :returns:
            The result of ``rnd.sample`` over the cached pool.
        """
        # Data is too large to fit in memory, so only the first
        # ``_FEWSHOT_POOL_SIZE`` training docs are cached and sampled from.
        # NOTE(review): assumes ``self._training_docs`` is initialised to None
        # by the base class -- confirm.
        if self._training_docs is None:
            self._training_docs = list(
                islice(self.training_docs(), self._FEWSHOT_POOL_SIZE)
            )

        return rnd.sample(self._training_docs, k)

    def doc_to_text(self, doc):
        """Render the prompt for ``doc`` as ``Q: <question text>\\n\\nA:``.

        :param doc:
            A document with the question string at ``doc['question']['text']``.
        """
        return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:'

    def doc_to_target(self, doc):
        """Return the long answer of ``doc`` as space-joined non-HTML tokens.

        There's a short answer and a long answer; based on the paper, the
        long answer is used. Only the first long-answer annotation is read.

        :param doc:
            A document providing ``doc['annotations']['long_answer'][0]`` with
            ``start_token``/``end_token`` and parallel ``token``/``is_html``
            sequences under ``doc['document']['tokens']``.
        :returns: str
            Tokens in ``[start_token, end_token)`` whose ``is_html`` flag is
            falsy, joined by single spaces.
        """
        # The short answer is intentionally not looked up: it was unused, and
        # indexing it raised for documents that have no short answer. To
        # target it instead, read doc['annotations']['short_answers'][0]['text'].
        first_long_answer = doc['annotations']['long_answer'][0]
        start = first_long_answer['start_token']
        end = first_long_answer['end_token']

        tokens = doc['document']['tokens']
        span_tokens = tokens['token'][start:end]
        span_is_html = tokens['is_html'][start:end]

        # Drop HTML markup tokens so only the answer text remains.
        return " ".join(
            tok for tok, is_html in zip(span_tokens, span_is_html) if not is_html
        )

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :raises NotImplementedError:
            Always; evaluation is not implemented for this task yet.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        :raises NotImplementedError:
            Always; evaluation is not implemented for this task yet.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        :raises NotImplementedError:
            Always; evaluation is not implemented for this task yet.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        :raises NotImplementedError:
            Always; evaluation is not implemented for this task yet.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')