Commit 6ca56eac authored by JessicaOjo

add squad metric

parent 187ab735
......@@ -58,6 +58,21 @@ def f1_score(items):
    return np.max(fscore)


@register_aggregation("squad_f1")
def squad_f1_score(items):
    gold_squad, pred_squad = [], []
    for index, (ref, pred) in enumerate(items):
        pred_dict = {'prediction_text': pred, 'id': str(index)}
        ref_dict = {'answers': {'answer_start': [0], 'text': [ref]}, 'id': str(index)}
        gold_squad.append(ref_dict)
        pred_squad.append(pred_dict)
    squad_metric = hf_evaluate.load("squad")
    results_squad = squad_metric.compute(predictions=pred_squad, references=gold_squad)
    return results_squad['f1']


@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
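
For reference, a minimal standalone sketch of what the squad_f1 aggregation above computes, assuming the Hugging Face evaluate package (imported in this file as hf_evaluate) is installed; the (reference, prediction) strings are invented examples:

import evaluate  # the metrics file imports this package as hf_evaluate

# (reference, prediction) pairs in the order the aggregation receives them
items = [
    ("Dakar", "Dakar"),
    ("the Nile river", "the Nile"),
]

gold_squad, pred_squad = [], []
for index, (ref, pred) in enumerate(items):
    pred_squad.append({"prediction_text": pred, "id": str(index)})
    gold_squad.append({"answers": {"answer_start": [0], "text": [ref]}, "id": str(index)})

squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=pred_squad, references=gold_squad)
print(results)  # dict with 'exact_match' and 'f1'; the aggregation returns results['f1']

The HF squad metric does the answer normalization and token-level F1 itself, which is why the aggregation only needs to reshape the pairs into SQuAD-format prediction and reference dicts.
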
......@@ -178,6 +193,16 @@ def exact_match_fn(**kwargs):
    return exact_match.compute(**kwargs)


@register_metric(
    metric="squad",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="squad_f1"
)
def squad_fn(items):
    return items


@register_metric(
    metric="perplexity",
    higher_is_better=False,
......
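
A hedged sketch (sample values invented) of the contract this registration sets up: the metric function itself is an identity over a single document's [gold, prediction] pair, and a corpus-level score only appears when the aggregation named by aggregation="squad_f1" runs over the collected pairs.

def squad_fn(items):
    return items  # identity: for one document this is just its [gold, result] pair

one_doc = ["the Nile river", "the Nile"]  # [gold, prediction] for a single document
assert squad_fn(one_doc) == one_doc       # nothing is scored per document here
# the "squad_f1" aggregation registered in the first hunk later receives the list
# of all such pairs and reduces them to a single corpus-level F1
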
......@@ -1294,6 +1294,7 @@ class ConfigurableTask(Task):
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"squad": (gold, pred)} if "squad" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
                    {"brier_score": (gold, prob_norm)}
......@@ -1365,18 +1366,23 @@ class ConfigurableTask(Task):
                    else:
                        result_score = 0.0
                else:
                    print(gold)
                    print(result)
                    print(metric)
                    try:
                        result_score = self._metric_fn_list[metric](
                            references=[gold],
                            predictions=[result],
                            **self._metric_fn_kwargs[metric],
                        )
                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                    except TypeError as error:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                        print(error)
                        result_score = self._metric_fn_list[metric]([gold, result])
                if isinstance(result_score, dict):
                    # TODO: this handles the case where HF evaluate returns a dict.
                    result_score = result_score[metric]
                result_dict[metric] = result_score
                print(f"Result Dict: {result_dict}")
        else:
            raise ValueError(
                f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
......
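
A minimal sketch of the interface mismatch the TypeError fallback above bridges, assuming (as the inline comment says) that HF Evaluate metrics take references=/predictions= keyword lists while the harness's own metric functions take a single [gold, result] pair; the metric functions below are purely illustrative, not the harness's real metrics:

def hf_style_metric(references, predictions):
    # HF Evaluate convention: keyword lists of references and predictions
    return float(references[0] == predictions[0])

def harness_style_metric(pair):
    # harness convention: a single [gold, result] pair
    gold, result = pair
    return float(gold == result)

def score(metric_fn, gold, result):
    try:
        return metric_fn(references=[gold], predictions=[result])
    except TypeError:
        # fall back when the callable does not accept the keyword interface
        return metric_fn([gold, result])

print(score(hf_style_metric, "Dakar", "Dakar"))       # 1.0 via the keyword path
print(score(harness_style_metric, "Dakar", "Lagos"))  # 0.0 via the fallback path
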
......@@ -44,13 +44,53 @@ class RegexFilter(Filter):
                filtered.append(match)
            return filtered

        # print(resps)
        filtered_resps = list(map(lambda x: filter_set(x), resps))
        # print(filtered_resps)
        return filtered_resps


@register_filter("regex-numbers")
class RegexFilter(Filter):
    """ """

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
    ) -> None:
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
        `fallback` defines the output returned if no matches for the regex are located.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)
        self.group_select = group_select
        self.fallback = fallback

    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        def filter_set(inst):
            filtered = []
            for resp in inst:
                match = self.regex.findall(resp)
                if match:
                    match = match[self.group_select]
                    if isinstance(match, tuple):
                        match = [m for m in match if m][0]
                    match = match.strip().replace(',', '').replace('.', '')
                else:
                    match = self.fallback
                filtered.append(match)
            return filtered

        filtered_resps = list(map(lambda x: filter_set(x), resps))
        return filtered_resps


@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
    """ """
......
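
A standalone sketch of what the new regex-numbers filter produces for a batch of model responses, using its default pattern and fallback; the response strings are invented. Note that the '.'-stripping step also removes decimal points, so the filter suits integer-valued answers:

import re

regex = re.compile(r"#### (\-?[0-9\.\,]+)")
fallback = "[invalid]"

# one inner list of candidate responses per document
resps = [
    ["The answer is 1,234.\n#### 1,234", "no final answer here"],
]

filtered = []
for inst in resps:
    out = []
    for resp in inst:
        match = regex.findall(resp)
        if match:
            m = match[0]
            out.append(m.strip().replace(",", "").replace(".", ""))
        else:
            out.append(fallback)
    filtered.append(out)

print(filtered)  # [['1234', '[invalid]']]
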