Commit f48b119d authored by Charles Foster

Skeleton of SQuADv2. Not yet tested.

parent 3d432b1a

import datasets

from lm_eval.base import rf, mean
from . common import HFTask


class SQuAD(HFTask):
    DATASET_PATH = "squad_v2"
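    # "squad_v2" is the Hugging Face datasets identifier for SQuAD 2.0, which
    # extends SQuAD 1.1 with questions that cannot be answered from the passage.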
@@ -18,16 +18,14 @@ class SQuAD(HFTask):
        return False

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc):
        return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
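
    # For illustration (the values below are made up, not taken from the
    # dataset), doc_to_text renders a record as:
    #
    #   Title: Some_Title
    #
    #   Background: A passage of background text.
    #
    #   Q: A question about the passage.
    #
    #   A:
    #
    # and the model's continuation after "A: " is treated as its answer.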
@@ -51,8 +49,8 @@ class SQuAD(HFTask):
        language description, as well as the few shot examples, and the question
        part of the document for `doc`.
        """
        continuation = rf.greedy_until(ctx, ['\n'])
        return continuation
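
    # rf.greedy_until builds a single generation request: the LM continues the
    # prompt greedily and stops at the first newline, matching the one-line
    # "A: <answer>" format set up by doc_to_text. The resulting continuation is
    # what process_results receives below.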

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluate, returning a
@@ -64,8 +62,21 @@ class SQuAD(HFTask):
        :param results:
            The results of the requests created in construct_requests.
        """
        squad_metric = datasets.load_metric("squad_v2")
        predictions = [{
            'id': doc['id'],
            'prediction_text': results[0],
            # The squad_v2 metric also expects a per-prediction no-answer
            # probability; 0.0 (i.e. always attempt an answer) is a placeholder
            # until proper no-answer scoring is added.
            'no_answer_probability': 0.0,
        }]
        references = [{
            'id': doc['id'],
            'answers': doc['answers'],
        }]
        metrics = squad_metric.compute(predictions=predictions, references=references)
        return metrics
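
    # Note: the squad_v2 metric is computed per document here and the submetrics
    # are then averaged in aggregation() below. The threshold-based submetrics
    # (best_exact, best_f1 and their thresholds) are designed to be swept over a
    # whole evaluation set, so their per-document means are only rough summaries.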

    def aggregation(self):
        """
@@ -73,8 +84,21 @@ class SQuAD(HFTask):
        A dictionary where keys are the names of submetrics and values are
        functions that aggregate a list of metrics
        """
        return {
            'exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'f1': mean,  # The F-score of predicted tokens versus the gold answer
            'total': mean,  # Number of scores considered
            'HasAns_exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'HasAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
            'HasAns_total': mean,  # Number of scores considered
            'NoAns_exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'NoAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
            'NoAns_total': mean,  # Number of scores considered
            'best_exact': mean,  # Best exact match (with varying threshold)
            'best_exact_thresh': mean,  # No-answer probability threshold associated with the best exact match
            'best_f1': mean,  # Best F1 (with varying threshold)
            'best_f1_thresh': mean,  # No-answer probability threshold associated with the best F1
        }

    def higher_is_better(self):
        """
@@ -82,5 +106,18 @@ class SQuAD(HFTask):
        A dictionary where keys are the names of submetrics and values are
        whether a higher value of the submetric is better
        """
        return {
            'exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'f1': True,  # The F-score of predicted tokens versus the gold answer
            'total': None,  # Number of scores considered
            'HasAns_exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'HasAns_f1': True,  # The F-score of predicted tokens versus the gold answer
            'HasAns_total': None,  # Number of scores considered
            'NoAns_exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'NoAns_f1': True,  # The F-score of predicted tokens versus the gold answer
            'NoAns_total': None,  # Number of scores considered
            'best_exact': True,  # Best exact match (with varying threshold)
            'best_exact_thresh': None,  # No-answer probability threshold associated with the best exact match
            'best_f1': True,  # Best F1 (with varying threshold)
            'best_f1_thresh': None,  # No-answer probability threshold associated with the best F1
        }
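

# A minimal, illustrative check (not part of the harness) of the input format
# that datasets.load_metric("squad_v2") expects and of the submetric keys it
# returns, which process_results and aggregation above rely on. The record is
# made up; the id and answer are hypothetical.
if __name__ == "__main__":
    squad_metric = datasets.load_metric("squad_v2")
    predictions = [
        {'id': '0', 'prediction_text': 'forty-two', 'no_answer_probability': 0.0},
    ]
    references = [
        {'id': '0', 'answers': {'text': ['forty-two'], 'answer_start': [22]}},
    ]
    # Returns 'exact', 'f1', 'total' and the best_* threshold metrics;
    # 'HasAns_*' / 'NoAns_*' appear when the references contain answerable /
    # unanswerable questions respectively.
    print(squad_metric.compute(predictions=predictions, references=references))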