"vscode:/vscode.git/clone" did not exist on "fcab73a9b60f515f63b0671d191040dd0103bff7"
Commit c013679d authored by Charles Foster
Browse files

Fixes to natural questions.

parent dc3560d0
...@@ -20,20 +20,22 @@ class NaturalQs(HFTask): ...@@ -20,20 +20,22 @@ class NaturalQs(HFTask):
def training_docs(self): def training_docs(self):
# Cache training for faster few-shot. # Cache training for faster few-shot.
# Data is too large to fit in memory. # Data is too large to fit in memory.
return self.data["train"] return self.data["train"]
def doc_to_text(self, doc, include_target=True): def doc_to_text(self, doc, include_target=True):
question = doc['question']['text'] question = doc['question']['text']
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer = " ".join(doc['document']['tokens']['token'][long_answer_start:long_answer_end])
text = 'Q: ' + question + '\n\n' + 'A: ' text = 'Q: ' + question + '\n\n' + 'A: '
if include_target: if include_target:
# There's a short answer and a long answer. Based on the paper, I'm using the long answer. # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
long_answer = " ".join(long_answer_chars)
text += long_answer # Replace with short_answer[0] for short answer text += long_answer # Replace with short_answer[0] for short answer
return text return text
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment