fix formatting (#2759)

0126f6d1 · Baber Abbasi · GitHub · 96966f53 · 0126f6d1 · 0126f6d1
Unverified Commit 0126f6d1 authored May 15, 2025 by Baber Abbasi Committed by GitHub May 15, 2025
6 changed files
--- a/lm_eval/tasks/paws-x/paws_de.yaml
+++ b/lm_eval/tasks/paws-x/paws_de.yaml
@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", richtig? Nein, "+sentence2, sentence1+", richtig
  "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
+process_docs: !function utils.process_docs_paraphrases
 task: paws_de
--- a/lm_eval/tasks/paws-x/paws_en.yaml
+++ b/lm_eval/tasks/paws-x/paws_en.yaml
@@ -3,4 +3,5 @@ dataset_name: en
 doc_to_choice: '{{[sentence1+", right? No, "+sentence2, sentence1+", right? Yes, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
+process_docs: !function utils.process_docs_paraphrases
 task: paws_en
--- a/lm_eval/tasks/paws-x/paws_es.yaml
+++ b/lm_eval/tasks/paws-x/paws_es.yaml
@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", verdad? No, "+sentence2, sentence1+", verdad? S
  "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
+process_docs: !function utils.process_docs_paraphrases
 task: paws_es
--- a/lm_eval/tasks/paws-x/paws_fr.yaml
+++ b/lm_eval/tasks/paws-x/paws_fr.yaml
@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", n''est-ce pas? Non, "+sentence2, sentence1+", n'
  pas? Oui, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
+process_docs: !function utils.process_docs_paraphrases
 task: paws_fr
--- a/lm_eval/tasks/paws-x/pawsx_template_yaml
+++ b/lm_eval/tasks/paws-x/pawsx_template_yaml
@@ -11,6 +11,7 @@ test_split: test
 doc_to_text: null
 doc_to_target: label
 doc_to_choice: null
+target_delimiter: ""
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/paws-x/utils.py
+++ b/lm_eval/tasks/paws-x/utils.py
+import re
+
+
+def general_detokenize(string):
+    """Undo tokenizer-style spacing around contractions, brackets, quotes and punctuation.
+
+    The literal replacements are applied one after another in a fixed order,
+    followed by a regex pass that removes the space before an apostrophe,
+    full stop or comma.
+    """
+    # Order matters: each replacement operates on the output of the previous one.
+    literal_fixes = (
+        (" n't", "n't"),
+        (" )", ")"),
+        ("( ", "("),
+        ('" ', '"'),
+        (' "', '"'),
+    )
+    detokenized = string
+    for spaced, joined in literal_fixes:
+        detokenized = detokenized.replace(spaced, joined)
+    return re.sub(r" (['.,])", r"\1", detokenized)
+
+
+def lowercase_first_letter(text):
+    """Return ``text`` with its first character lowercased.
+
+    An empty string is returned unchanged; slicing (rather than indexing)
+    avoids an ``IndexError`` when ``text`` is empty.
+    """
+    return text[:1].lower() + text[1:]
+
+
+def process_docs_paraphrases(dataset):
+    """Drop incomplete sentence pairs and normalise the remaining ones.
+
+    Docs whose "sentence1" or "sentence2" is ``None`` or an empty string are
+    reported on stdout and removed. Every remaining doc is detokenized and
+    stripped; a trailing ".", "," or ";" is removed from "sentence1" and the
+    first letter of "sentence2" is lowercased.
+
+    Args:
+        dataset: Collection of docs with "sentence1" and "sentence2" fields.
+            It must support iteration, ``len()``, ``.filter()`` and ``.map()``
+            (presumably a Hugging Face ``datasets.Dataset`` -- TODO confirm
+            against the caller).
+
+    Returns:
+        The result of ``dataset.filter(...).map(...)`` with the incomplete
+        docs removed and the remaining docs normalised.
+    """
+
+    def _is_complete(doc):
+        # A doc is usable only when both sentences are present and non-empty.
+        return doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [
+            None,
+            "",
+        ]
+
+    def _process_doc(doc):
+        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
+        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
+        # Remove final punctuation mark in the first sentence
+        if doc["sentence1"].endswith((".", ",", ";")):
+            doc["sentence1"] = doc["sentence1"][:-1]
+        # Start the second sentence in lowercase (to be used after "Yes, ...")
+        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
+        return doc
+
+    # Collect the incomplete docs *before* filtering so the report is
+    # actually emitted; gathering them inside the mapped function would be
+    # too late (and would never see them, since the filter runs first).
+    empty_docs = [doc for doc in dataset if not _is_complete(doc)]
+    if empty_docs:
+        len_empty_docs = len(empty_docs)
+        print(
+            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
+        )
+    return dataset.filter(_is_complete).map(_process_doc)