Unverified Commit a3f10268 authored by sdtblck, committed by GitHub

Update mlqa.py

parent c7173ca3
@@ -170,7 +170,7 @@ def mlqa_metric(predictions, references, answer_lang):
    return evaluate(dataset, pred_dict, answer_lang)

-def mlqa_agg(key, items, answer_lang):
+def mlqa_agg(items, key, answer_lang):
    predictions, references = zip(*items)
    return mlqa_metric(predictions=predictions, references=references, answer_lang=answer_lang)[key]
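The argument reordering matters because the aggregation entries further down build this function with `functools.partial`, binding `key` and `answer_lang` as keywords; the resulting callable is then invoked with the collected items as its only positional argument (an assumption about the call site, but consistent with the `partial(...)` usage in this file), so `items` has to be the first parameter. A minimal sketch of the failure mode with the old order and the fix with the new one:

```python
from functools import partial

def mlqa_agg_old(key, items, answer_lang):
    """Pre-change argument order."""
    predictions, references = zip(*items)
    return f"{key} over {len(predictions)} items ({answer_lang})"

def mlqa_agg_new(items, key, answer_lang):
    """Post-change argument order."""
    predictions, references = zip(*items)
    return f"{key} over {len(predictions)} items ({answer_lang})"

# One (prediction, reference) pair, shaped loosely like the harness items.
items = [({'prediction_text': 'Paris'}, {'answers': {'text': ['Paris']}})]

agg = partial(mlqa_agg_new, key='exact_match', answer_lang='en')
print(agg(items))  # works: items fills the first positional slot

agg_old = partial(mlqa_agg_old, key='exact_match', answer_lang='en')
try:
    agg_old(items)  # items lands on `key`, which partial already bound
except TypeError as err:
    print(err)     # "mlqa_agg_old() got multiple values for argument 'key'"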
@@ -218,8 +218,7 @@ class MLQABase(HFTask):
        part of the document for `doc`.
        """
        continuation = rf.greedy_until(ctx, ['\n'])
-        is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
-        return continuation, is_unanswerable
+        return continuation

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -233,12 +232,9 @@ class MLQABase(HFTask):
        """
-        continuation, (logprob_unanswerable, _) = results
-        no_answer_probability = exp(logprob_unanswerable)
        predictions = {
            'id': doc['id'],
            'prediction_text': continuation,
-            'no_answer_probability': no_answer_probability,
        }

        references = {
@@ -247,7 +243,7 @@ class MLQABase(HFTask):
        }

        return {
-            'exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
+            'exact_match': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
            'f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
        }
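With the `unanswerable` loglikelihood request dropped, `results` carries only the greedy continuation, and the SQuAD-style no-answer probability disappears from the prediction dict. The hunk above does not show how `continuation` is obtained afterwards, so the following is only a hedged sketch of what `process_results` plausibly looks like after this commit; the `results[0]` unpacking and the contents of `references` are assumptions, not code from the repository:

```python
def process_results(self, doc, results):
    # Assumption: the greedy_until continuation is now the only result.
    continuation = results[0]

    predictions = {
        'id': doc['id'],
        'prediction_text': continuation,
        # 'no_answer_probability' is gone along with the loglikelihood request.
    }

    references = {
        'id': doc['id'],
        'answers': doc['answers'],  # assumed gold-answer field; not shown in the hunk
    }

    return {
        'exact_match': (predictions, references),  # renamed from 'exact'
        'f1': (predictions, references),
    }
```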
@@ -258,7 +254,7 @@ class MLQABase(HFTask):
        functions that aggregate a list of metrics
        """
        return {
-            'exact': partial(mlqa_agg, key='exact', answer_lang=self.ANSWER_LANG), # Exact match (the normalized
+            'exact_match': partial(mlqa_agg, key='exact_match', answer_lang=self.ANSWER_LANG), # Exact match (the normalized
            # answer exactly match the gold answer)
            'f1': partial(mlqa_agg, key='f1', answer_lang=self.ANSWER_LANG), # The F-score of predicted tokens
            # versus the gold answer
@@ -271,7 +267,7 @@ class MLQABase(HFTask):
        whether a higher value of the submetric is better
        """
        return {
-            'exact': True, # Exact match (the normalized answer exactly match the gold answer)
+            'exact_match': True, # Exact match (the normalized answer exactly match the gold answer)
            'f1': True, # The F-score of predicted tokens versus the gold answer
        }
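The rename has to be applied consistently: `process_results`, `aggregation`, and `higher_is_better` all key their dicts by metric name, and the harness lines the three up by that name. A rough sketch of the matching step (assumed behaviour for illustration, not code from lm-evaluation-harness) shows why a stray 'exact' left next to 'exact_match' would surface as a KeyError or a silently missing metric:

```python
from collections import defaultdict

def aggregate_metrics(per_doc_metrics, aggregation_fns):
    """Group per-document metric values by name and feed each group to the
    aggregator registered under the same name (illustrative only)."""
    grouped = defaultdict(list)
    for metrics in per_doc_metrics:          # one dict per evaluated document
        for name, value in metrics.items():
            grouped[name].append(value)
    # Any mismatch between the names used in process_results and the names
    # registered in aggregation() blows up here.
    return {name: aggregation_fns[name](items) for name, items in grouped.items()}

# Toy usage with plain numbers standing in for (predictions, references) pairs.
per_doc = [{'exact_match': 1, 'f1': 0.5}, {'exact_match': 0, 'f1': 0.75}]
aggs = {'exact_match': sum, 'f1': lambda xs: sum(xs) / len(xs)}
print(aggregate_metrics(per_doc, aggs))      # {'exact_match': 1, 'f1': 0.625}
```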