Unverified commit 0126f6d1, authored by Baber Abbasi, committed by GitHub
Browse files

fix formatting (#2759)

parent 96966f53
......@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", richtig? Nein, "+sentence2, sentence1+", richtig
"+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
process_docs: !function utils.process_docs_paraphrases
task: paws_de
......@@ -3,4 +3,5 @@ dataset_name: en
doc_to_choice: '{{[sentence1+", right? No, "+sentence2, sentence1+", right? Yes, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
process_docs: !function utils.process_docs_paraphrases
task: paws_en
......@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", verdad? No, "+sentence2, sentence1+", verdad? S
"+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
process_docs: !function utils.process_docs_paraphrases
task: paws_es
......@@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", n''est-ce pas? Non, "+sentence2, sentence1+", n'
pas? Oui, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
process_docs: !function utils.process_docs_paraphrases
task: paws_fr
......@@ -11,6 +11,7 @@ test_split: test
doc_to_text: null
doc_to_target: label
doc_to_choice: null
target_delimiter: ""
metric_list:
- metric: acc
aggregation: mean
......
import re
def general_detokenize(string):
    """Re-attach punctuation that tokenization separated with spaces.

    Applies a fixed sequence of literal replacements (contracted "n't",
    parentheses, double quotes) and then removes the single space that
    precedes an apostrophe, period or comma.

    Args:
        string: The tokenized text.

    Returns:
        The text with those spaces removed.
    """
    # Applied in order: each pair is (tokenized form, detokenized form).
    literal_fixes = (
        (" n't", "n't"),
        (" )", ")"),
        ("( ", "("),
        ('" ', '"'),
        (' "', '"'),
    )
    detokenized = string
    for spaced, joined in literal_fixes:
        detokenized = detokenized.replace(spaced, joined)
    # Drop the space in front of an apostrophe, period or comma.
    return re.sub(r" (['.,])", r"\1", detokenized)
def lowercase_first_letter(text):
    """Return ``text`` with its first character lowercased.

    The rest of the string is left untouched. An empty string is returned
    unchanged: the first character is taken with a slice rather than an
    index, so empty input does not raise ``IndexError``.

    Args:
        text: The string to transform.

    Returns:
        A new string whose first character (if any) is lowercased.
    """
    return text[:1].lower() + text[1:]
def process_docs_paraphrases(dataset):
    """Drop incomplete sentence pairs and normalise the remaining ones.

    A document is kept only when both ``sentence1`` and ``sentence2`` are
    neither ``None`` nor the empty string. Each kept document is
    detokenized and stripped; a trailing ``.``, ``,`` or ``;`` is removed
    from ``sentence1`` and the first letter of ``sentence2`` is lowercased.

    When documents were dropped, the number of dropped documents is printed.

    Args:
        dataset: Object exposing ``filter(fn)`` and ``map(fn)`` over
            documents that have ``"sentence1"`` and ``"sentence2"`` keys.

    Returns:
        The result of ``dataset.filter(...).map(...)``.
    """

    def _has_both_sentences(doc):
        # Shared by the filter and (indirectly) the report below.
        return doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [
            None,
            "",
        ]

    def _process_doc(doc):
        # Only applied to documents that already passed _has_both_sentences.
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove final punctuation mark in the first sentence
        if doc["sentence1"].endswith((".", ",", ";")):
            doc["sentence1"] = doc["sentence1"][:-1]
        # Start the second sentence in lowercase (to be used after "Yes, ...")
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        return doc

    complete_docs = dataset.filter(_has_both_sentences)

    # Derive the number of dropped documents from the sizes before and after
    # filtering, so the report reflects what the filter actually removed.
    try:
        num_empty = len(dataset) - len(complete_docs)
    except TypeError:
        # No length available on this dataset: skip the report, don't fail.
        num_empty = 0
    if num_empty:
        print(
            f"Found {num_empty} empty documents out of the {len(dataset)} total docs in the dataset"
        )

    return complete_docs.map(_process_doc)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment