add wsc utils

faa2e43d · lintangsutawika · 007d485b · faa2e43d · faa2e43d
Commit faa2e43d authored Aug 24, 2023 by lintangsutawika
Showing with 96 additions and 6 deletions

lm_eval/tasks/super_glue/wsc/t5-prompt.yaml lm_eval/tasks/super_glue/wsc/t5-prompt.yaml +8 -6

lm_eval/tasks/super_glue/wsc/t5_utils.py lm_eval/tasks/super_glue/wsc/t5_utils.py +88 -0

No files found.
--- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
@@ -2,15 +2,17 @@ group:
  - super-glue-t5-prompt
 task: super_glue-wsc-t5-prompt
 dataset_path: super_glue
-dataset_name: wsc
+dataset_name: wsc.fixed
 training_split: train
 validation_split: validation
 output_type: greedy_until
-doc_to_text: !function "t5_utils.t5_prompt_doc_to_text"
-doc_to_target: "{{[span1_text, span2_text][label]}}"
+doc_to_text: !function "t5_utils.doc_to_text"
+doc_to_target: label
 metric_list:
-  - metric: exact_match
+  - metric: accuracy
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
+filter_list:
+  - name: "wsc_postprocessor"
+    filter:
+      - function: !function t5_utils.WSCPostprocess
\ No newline at end of file
--- a/lm_eval/tasks/super_glue/wsc/t5_utils.py
+++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py
+import re
+from lm_eval.api.filter import Filter
+
+def doc_to_text(x):
+    text = re.sub(r" X ", " *"+x["span2_text"]+"* ", _wsc_inputs(x))
+    return "wsc: "+text
+
+
+def _wsc_inputs(x):
+
+    words = x['text'].split(" ")
+
+    # We would need some special logic to handle the case where the pronoun is the
+    # first or last word in the text. None of the examples in WSC seem to have
+    # this, so we are ignoring these cases.
+    assert x['span2_index'] > 0
+    assert x['span2_index'] < len(words)
+    pronoun_index = x['span2_index']
+
+    def create_input():
+        assert words[pronoun_index] == x['span2_text']
+
+        return " ".join([
+              " ".join(
+                words[:pronoun_index]
+                ),
+              'X',
+              " ".join(
+                words[pronoun_index + 1:]
+                ),
+            ])
+
+    # Handle some special cases.
+    if x['text'] == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. \"Good for him,\" he said. ':
+        return (
+            'The boy continued to whip the pony , and eventually the pony threw '
+            'him over. John laughed out quite loud. "Good for X ," he said.'
+        )
+
+    # Using the span2_index, we get 'use' instead of 'it'.
+    if x['text'] == 'When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?':
+        return (
+            'When they had eventually calmed down a bit , and had gotten home, '
+            'Mr. Farley put the magic pebble in an iron safe . Some day they might '
+            'want to use X , but really for now, what more could they wish for?'
+        )
+
+    return create_input()
+
+
+class WSCPostprocess(Filter):
+
+    def __init__(self, **kwargs):
+
+        self.determiners = {
+            "a", "an", "few", "her", "his", "each", "every", "many", "much", "my",
+            "our", "some", "that", "the", "their", "these", "this", "those", "which",
+            "whose", "your"
+        }
+
+    def clean(self, s):
+        """Ignore capitalization and determiners."""
+        s = s.strip().lower()
+        return " ".join([w for w in s.split(" ") if w not in self.determiners])
+
+    def apply(self, resps, docs):
+
+        filtered_resps = []
+        for prediction, reference in zip(*(resps, docs["span1_text"])):
+
+            prediction = self.clean(prediction[0])
+            reference = self.clean(reference)
+
+            if ("'" in prediction) != ("'" in reference):
+                # referent is "Bob's hat" as predicting the referent.
+                predicted_referent = False
+            else:
+                prediction_words = set(prediction.split(" "))
+                referent_words = set(reference.split(" "))
+
+                # Handle cases where the prediction is "fuzzy bunny" and the referent is
+                # "bunny".
+                predicted_referent = prediction_words.issubset(
+                    referent_words) or referent_words.issubset(prediction_words)
+
+            filtered_resps.append(predicted_referent)
+
+        return filtered_resps