Commit eb9f6788 authored by JessicaOjo

pr review changes

parent 0dcdfb80
@@ -58,19 +58,6 @@ def f1_score(items):
    return np.max(fscore)
-@register_aggregation("squad_f1")
-def squad_f1_score(items):
-    gold_squad, pred_squad = [], []
-    for index, (ref, pred) in enumerate(items):
-        pred_dict = {'prediction_text': str(pred), 'id': str(index)}
-        ref_dict = {'answers': {'answer_start': [0], 'text': [str(ref)]}, 'id': str(index)}
-        gold_squad.append(ref_dict)
-        pred_squad.append(pred_dict)
-    squad_metric = hf_evaluate.load("squad")
-    results_squad = squad_metric.compute(predictions=pred_squad, references=gold_squad)
-    return results_squad['f1']/100
@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
@@ -192,15 +179,6 @@ def exact_match_fn(**kwargs):
    return exact_match.compute(**kwargs)
-@register_metric(
-    metric="squad",
-    higher_is_better=True,
-    output_type="generate_until",
-    aggregation="squad_f1"
-)
-def squad_fn(items):
-    return items
@register_metric(
    metric="perplexity",
    higher_is_better=False,
...
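
For reference, a minimal standalone sketch of what the removed "squad_f1" aggregation computed, assuming the HuggingFace `evaluate` package is installed; the example pairs in the final call are illustrative only.

import evaluate as hf_evaluate

def squad_f1(items):
    # items: iterable of (reference, prediction) string pairs
    gold_squad, pred_squad = [], []
    for index, (ref, pred) in enumerate(items):
        pred_squad.append({"prediction_text": str(pred), "id": str(index)})
        gold_squad.append(
            {"answers": {"answer_start": [0], "text": [str(ref)]}, "id": str(index)}
        )
    # the SQuAD metric takes the id-keyed dicts built above and
    # returns {"exact_match": ..., "f1": ...} on a 0-100 scale
    squad_metric = hf_evaluate.load("squad")
    results = squad_metric.compute(predictions=pred_squad, references=gold_squad)
    return results["f1"] / 100

print(squad_f1([("Paris", "Paris"), ("42", "forty two")]))  # expected: 0.5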
@@ -1417,7 +1417,6 @@ class ConfigurableTask(Task):
                **({"acc": acc} if "acc" in use_metric else {}),
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
-                **({"squad": (gold, pred)} if "squad" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
@@ -1437,13 +1436,10 @@ class ConfigurableTask(Task):
            gold = self.doc_to_target(doc)
            result = results[0]
            if self.config.doc_to_choice is not None:
-                try:
-                    # If you set doc_to_choice,
-                    # it assumes that doc_to_target returns a number.
-                    choices = self.doc_to_choice(doc)
-                    gold = choices[gold]
-                except TypeError:
-                    gold = gold
+                # If you set doc_to_choice,
+                # it assumes that doc_to_target returns a number.
+                choices = self.doc_to_choice(doc)
+                gold = choices[gold]
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
@@ -1492,20 +1488,12 @@ class ConfigurableTask(Task):
                        result_score = 0.0
                else:
                    try:
-                        # adds exact match logic
-                        if metric == "exact_match":
-                            result_score = self._metric_fn_list[metric](
-                                references=[str(gold)],
-                                predictions=[str(result)],
-                                **self._metric_fn_kwargs[metric],
-                            )
-                        else:
-                            result_score = self._metric_fn_list[metric](
-                                references=[gold],
-                                predictions=[result],
-                                **self._metric_fn_kwargs[metric],
-                            )
-                    except TypeError as error:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
+                        result_score = self._metric_fn_list[metric](
+                            references=[gold],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[metric],
+                        )
+                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                        result_score = self._metric_fn_list[metric]([gold, result])
                    if isinstance(result_score, dict):
                        # TODO: this handles the case where HF evaluate returns a dict.
...
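
The try/except kept in this hunk bridges two metric calling conventions. A simplified sketch of that dispatch (not the ConfigurableTask code itself), with hypothetical metric callables standing in for the registered ones:

from typing import Any, Callable

def score_with_fallback(metric_fn: Callable, gold: Any, result: Any, **metric_kwargs) -> Any:
    try:
        # HF Evaluate style: keyword lists of references and predictions
        return metric_fn(references=[gold], predictions=[result], **metric_kwargs)
    except TypeError:
        # library-internal style: a single [gold, result] pair
        return metric_fn([gold, result])

# hypothetical metric callables, for illustration only
def hf_style(references, predictions):
    return float(references == predictions)

def internal_style(pair):
    return float(pair[0] == pair[1])

print(score_with_fallback(hf_style, "yes", "yes"))       # 1.0 via the keyword path
print(score_with_fallback(internal_style, "yes", "no"))  # 0.0 via the [gold, result] fallback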
@@ -49,43 +49,6 @@ class RegexFilter(Filter):
        return filtered_resps
-@register_filter("verbalizer")
-class VerbalizerFilter(Filter):
-    """ """
-    def __init__(
-        self,
-        verbalizer_dict: dict,
-    ) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
`fallback` defines the output returned if no matches for the regex are located.
"""
-        self.verbalizer_dict = verbalizer_dict
-    def apply(self, resps, docs):
-        # here, we assume we have a list, in which each element is
-        # a list of model responses for some particular input/target pair.
-        # so we process each of these (same input/target response sets)
-        # independently (and keep them a list.)
-        def verbalize(value):
-            for key, values in self.verbalizer_dict.items():
-                for v in values:
-                    if v in value:
-                        return key
-            return value
-        def filter_value(inst):
-            filtered = []
-            for resp in inst:
-                match = verbalize(resp.lower())
-                filtered.append(match)
-            return filtered
-        filtered_resps = map(lambda x: filter_value(x), resps)
-        return filtered_resps
@register_filter("remove_whitespace") @register_filter("remove_whitespace")
class WhitespaceFilter(Filter): class WhitespaceFilter(Filter):
""" """ """ """
......
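
For context on the deleted "verbalizer" filter, a rough standalone sketch of the mapping it applied, assuming a dict of the form {label: [surface forms]}; the dictionary and responses below are made up for illustration.

verbalizer_dict = {"positive": ["good", "great"], "negative": ["bad", "poor"]}

def verbalize(value: str) -> str:
    # return the first label whose surface forms occur in the response,
    # otherwise fall back to the raw response
    for label, surface_forms in verbalizer_dict.items():
        if any(form in value for form in surface_forms):
            return label
    return value

resps = [["This was a great movie.", "Pretty bad overall."]]
filtered_resps = [[verbalize(resp.lower()) for resp in inst] for inst in resps]
print(filtered_resps)  # [['positive', 'negative']]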