"vscode:/vscode.git/clone" did not exist on "1c0a1f8e140765abb068a4c1e9424f76d037c0e6"
naturalqs.py 1.67 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from . common import HFTask

class NaturalQs(HFTask):
    """Task wrapper for the ``natural_questions`` dataset.

    Exposes the train and validation splits (no test split) and renders
    each document as a ``Q: ... A: ...`` prompt whose target is the long
    answer with HTML tokens removed.
    """

    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

    def has_training_docs(self):
        """Return True: a training split is available."""
        return True

    def has_validation_docs(self):
        """Return True: a validation split is available."""
        return True

    def has_test_docs(self):
        """Return False: no test split is exposed by this task."""
        return False

    def fewshot_description(self):
        """Return the few-shot task description (currently empty)."""
        # TODO: figure out description
        return ""

    def training_docs(self):
        """Return the training split straight from ``self.data``.

        The split is handed back as-is instead of being cached in memory,
        because the data is too large to fit in memory.
        """
        return self.data["train"]

    def doc_to_text(self, doc, include_target=True):
        """Render a document as a question/answer prompt.

        Args:
            doc: A dataset record. This method reads
                ``doc['question']['text']`` and, when the target is
                included, ``doc['annotations']['long_answer'][0]`` (for
                ``start_token`` / ``end_token``) and
                ``doc['document']['tokens']`` (for the parallel ``token``
                and ``is_html`` sequences).
            include_target: When True, append the long answer after
                ``'A: '``; when False, the prompt ends at ``'A: '``.

        Returns:
            The prompt string ``'Q: <question>\\n\\nA: '`` optionally
            followed by the long-answer text.
        """
        question = doc['question']['text']

        text = 'Q: ' + question + '\n\n' + 'A: '

        if include_target:
            # There's a short answer and a long answer. Based on the paper,
            # the long answer is used here. To switch to the short answer,
            # read doc['annotations']['short_answers'][0]['text'] instead
            # (the original note suggests taking its first element).
            long_answer = doc['annotations']['long_answer'][0]
            start = long_answer['start_token']
            end = long_answer['end_token']

            tokens = doc['document']['tokens']
            span_tokens = tokens['token'][start:end]
            span_is_html = tokens['is_html'][start:end]

            # Keep only the non-HTML tokens of the long-answer span.
            answer_tokens = [
                tok
                for tok, is_html in zip(span_tokens, span_is_html)
                if not is_html
            ]
            text += " ".join(answer_tokens)

        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Evaluate the task; not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        # TODO: implement
        raise NotImplementedError()