Commit 6ca56eac authored by JessicaOjo

add squad metric

parent 187ab735
......@@ -58,6 +58,21 @@ def f1_score(items):
    return np.max(fscore)


@register_aggregation("squad_f1")
def squad_f1_score(items):
    gold_squad, pred_squad = [], []
    for index, (ref, pred) in enumerate(items):
        pred_dict = {'prediction_text': pred, 'id': str(index)}
        ref_dict = {'answers': {'answer_start': [0], 'text': [ref]}, 'id': str(index)}
        gold_squad.append(ref_dict)
        pred_squad.append(pred_dict)
    squad_metric = hf_evaluate.load("squad")
    results_squad = squad_metric.compute(predictions=pred_squad, references=gold_squad)
    return results_squad['f1']


@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
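
For reference, a minimal standalone sketch of what the squad_f1 aggregation above computes, assuming the Hugging Face evaluate package (imported in this file as hf_evaluate) is installed; the (reference, prediction) strings are invented examples:

import evaluate  # the metrics file imports this package as hf_evaluate

# (reference, prediction) pairs in the order the aggregation receives them
items = [
    ("Dakar", "Dakar"),
    ("the Nile river", "the Nile"),
]

gold_squad, pred_squad = [], []
for index, (ref, pred) in enumerate(items):
    pred_squad.append({"prediction_text": pred, "id": str(index)})
    gold_squad.append({"answers": {"answer_start": [0], "text": [ref]}, "id": str(index)})

squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=pred_squad, references=gold_squad)
print(results)  # dict with 'exact_match' and 'f1'; the aggregation returns results['f1']

The HF squad metric does the answer normalization and token-level F1 itself, which is why the aggregation only needs to reshape the pairs into SQuAD-format prediction and reference dicts.
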
......@@ -178,6 +193,16 @@ def exact_match_fn(**kwargs):
    return exact_match.compute(**kwargs)


@register_metric(
    metric="squad",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="squad_f1"
)
def squad_fn(items):
    return items


@register_metric(
    metric="perplexity",
    higher_is_better=False,
......
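
A hedged sketch (sample values invented) of the contract this registration sets up: the metric function itself is an identity over a single document's [gold, prediction] pair, and a corpus-level score only appears when the aggregation named by aggregation="squad_f1" runs over the collected pairs.

def squad_fn(items):
    return items  # identity: for one document this is just its [gold, result] pair

one_doc = ["the Nile river", "the Nile"]  # [gold, prediction] for a single document
assert squad_fn(one_doc) == one_doc       # nothing is scored per document here
# the "squad_f1" aggregation registered in the first hunk later receives the list
# of all such pairs and reduces them to a single corpus-level F1
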
......@@ -1294,6 +1294,7 @@ class ConfigurableTask(Task):
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"squad": (gold, pred)} if "squad" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
                    {"brier_score": (gold, prob_norm)}
......@@ -1365,18 +1366,23 @@ class ConfigurableTask(Task):
                    else:
                        result_score = 0.0
                else:
                    print(gold)
                    print(result)
                    print(metric)
                    try:
                        result_score = self._metric_fn_list[metric](
                            references=[gold],
                            predictions=[result],
                            **self._metric_fn_kwargs[metric],
                        )
                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                    except TypeError as error:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                        print(error)
                        result_score = self._metric_fn_list[metric]([gold, result])
                if isinstance(result_score, dict):
                    # TODO: this handles the case where HF evaluate returns a dict.
                    result_score = result_score[metric]
                result_dict[metric] = result_score
                print(f"Result Dict: {result_dict}")
        else:
            raise ValueError(
                f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
......
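
A minimal sketch of the interface mismatch the TypeError fallback above bridges, assuming (as the inline comment says) that HF Evaluate metrics take references=/predictions= keyword lists while the harness's own metric functions take a single [gold, result] pair; the metric functions below are purely illustrative, not the harness's real metrics:

def hf_style_metric(references, predictions):
    # HF Evaluate convention: keyword lists of references and predictions
    return float(references[0] == predictions[0])

def harness_style_metric(pair):
    # harness convention: a single [gold, result] pair
    gold, result = pair
    return float(gold == result)

def score(metric_fn, gold, result):
    try:
        return metric_fn(references=[gold], predictions=[result])
    except TypeError:
        # fall back when the callable does not accept the keyword interface
        return metric_fn([gold, result])

print(score(hf_style_metric, "Dakar", "Dakar"))       # 1.0 via the keyword path
print(score(harness_style_metric, "Dakar", "Lagos"))  # 0.0 via the fallback path
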
......@@ -44,13 +44,53 @@ class RegexFilter(Filter):
                filtered.append(match)
            return filtered

        # print(resps)
        filtered_resps = list(map(lambda x: filter_set(x), resps))
        # print(filtered_resps)
        return filtered_resps


@register_filter("regex-numbers")
class RegexFilter(Filter):
    """ """

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
    ) -> None:
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
        `fallback` defines the output returned if no matches for the regex are located.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)
        self.group_select = group_select
        self.fallback = fallback

    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        def filter_set(inst):
            filtered = []
            for resp in inst:
                match = self.regex.findall(resp)
                if match:
                    match = match[self.group_select]
                    if isinstance(match, tuple):
                        match = [m for m in match if m][0]
                    match = match.strip().replace(',', '').replace('.', '')
                else:
                    match = self.fallback
                filtered.append(match)
            return filtered

        filtered_resps = list(map(lambda x: filter_set(x), resps))
        return filtered_resps


@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
    """ """
......
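
A standalone sketch of what the new regex-numbers filter produces for a batch of model responses, using its default pattern and fallback; the response strings are invented. Note that the '.'-stripping step also removes decimal points, so the filter suits integer-valued answers:

import re

regex = re.compile(r"#### (\-?[0-9\.\,]+)")
fallback = "[invalid]"

# one inner list of candidate responses per document
resps = [
    ["The answer is 1,234.\n#### 1,234", "no final answer here"],
]

filtered = []
for inst in resps:
    out = []
    for resp in inst:
        match = regex.findall(resp)
        if match:
            m = match[0]
            out.append(m.strip().replace(",", "").replace(".", ""))
        else:
            out.append(fallback)
    filtered.append(out)

print(filtered)  # [['1234', '[invalid]']]
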