Commit f7aaff08 authored by Stephen Hogg
Browse files

Mark unanswerable as TODO

parent be55ea8c
...@@ -24,6 +24,7 @@ https://arxiv.org/abs/2105.03011 ...@@ -24,6 +24,7 @@ https://arxiv.org/abs/2105.03011
""" """
from collections import Counter from collections import Counter
from math import exp from math import exp
import random
import re import re
import string import string
from lm_eval.base import rf from lm_eval.base import rf
...@@ -157,10 +158,10 @@ class QASPER(HFTask): ...@@ -157,10 +158,10 @@ class QASPER(HFTask):
ll_yes, ll_no, (logprob_unanswerable, _) = results ll_yes, ll_no, (logprob_unanswerable, _) = results
res_dict = {} res_dict = {}
# Handle unanswerability first # TODO: Handle unanswerability first
unanswerable_gold = doc["answer_type"] == "unanswerable" # unanswerable_gold = doc["answer_type"] == "unanswerable"
unanswerable_pred = exp(logprob_unanswerable) > 1 - exp(logprob_unanswerable) # unanswerable_pred = exp(logprob_unanswerable)
res_dict["f1_unanswerable"] = (unanswerable_gold, unanswerable_pred) # res_dict["f1_unanswerable"] = (unanswerable_gold, unanswerable_pred)
# Handle yes/no questions # Handle yes/no questions
if doc["answer_type"] == "bool": if doc["answer_type"] == "bool":
...@@ -179,7 +180,6 @@ class QASPER(HFTask): ...@@ -179,7 +180,6 @@ class QASPER(HFTask):
def aggregation(self): def aggregation(self):
return { return {
"f1_unanswerable": f1_score,
"f1_yesno": f1_score, "f1_yesno": f1_score,
"f1_abstractive": mean, "f1_abstractive": mean,
} }
...@@ -212,7 +212,6 @@ class QASPER(HFTask): ...@@ -212,7 +212,6 @@ class QASPER(HFTask):
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
return { return {
"f1_unanswerable": True,
"f1_yesno": True, "f1_yesno": True,
"f1_abstractive": True, "f1_abstractive": True,
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment